glibc/sysdeps/arm/armv7/strcmp.S

551 lines
13 KiB
ArmAsm

/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
Copyright (C) 2012-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <arm-features.h>
#include <sysdep.h>
/* Implementation of strcmp for ARMv7 when DSP instructions are
available. Use ldrd to support wider loads, provided the data
is sufficiently aligned. Use saturating arithmetic to optimize
the compares. */
/* Build Options:
STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
string. If comparing completely random strings the pre-check will
save time, since there is a very high probability of a mismatch in
the first character: we save significant overhead if this is the
common case. However, if strings are likely to be identical (e.g.
because we're verifying a hit in a hash table), then this check
is largely redundant. */
#define STRCMP_PRECHECK 1
.syntax unified
#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
/* Parameters and result. */
#define src1 r0
#define src2 r1
#define result r0 /* Overlaps src1. */
/* Internal variables. */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12
/* Additional internal variables for 64-bit aligned data. */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2
/* Additional internal variables for 32-bit aligned data. */
#define data1 r2
#define data2 r3
#define syndrome tmp2
#ifndef NO_THUMB
/* This code is best on Thumb. */
.thumb
/* In Thumb code we can't use MVN with a register shift, but we do have ORN. */
.macro prepare_mask mask_reg, nbits_reg
S2HI \mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
orn \data_reg, \data_reg, \mask_reg
.endm
#else
/* In ARM code we don't have ORN, but we can use MVN with a register shift. */
.macro prepare_mask mask_reg, nbits_reg
mvn \mask_reg, const_m1, S2HI \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
orr \data_reg, \data_reg, \mask_reg
.endm
/* These clobber the condition codes, which the real Thumb cbz/cbnz
instructions do not. But it doesn't matter for any of the uses here. */
.macro cbz reg, label
cmp \reg, #0
beq \label
.endm
.macro cbnz reg, label
cmp \reg, #0
bne \label
.endm
#endif
/* Macro to compute and return the result value for word-aligned
cases. */
.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
/* If data1 contains a zero byte, then syndrome will contain a 1 in
bit 7 of that byte. Otherwise, the highest set bit in the
syndrome will highlight the first different bit. It is therefore
sufficient to extract the eight bits starting with the syndrome
bit. */
clz tmp1, \synd
lsl r1, \d2, tmp1
.if \restore_r6
ldrd r6, r7, [sp, #8]
.endif
lsl \d1, \d1, tmp1
lsr result, \d1, #24
ldrd r4, r5, [sp], #16
cfi_remember_state
cfi_def_cfa_offset (0)
cfi_restore (r4)
cfi_restore (r5)
cfi_restore (r6)
cfi_restore (r7)
sub result, result, r1, lsr #24
bx lr
#else
/* To use the big-endian trick we'd have to reverse all three words.
that's slower than this approach. */
rev \synd, \synd
clz tmp1, \synd
bic tmp1, tmp1, #7
lsr r1, \d2, tmp1
.if \restore_r6
ldrd r6, r7, [sp, #8]
.endif
lsr \d1, \d1, tmp1
and result, \d1, #255
and r1, r1, #255
ldrd r4, r5, [sp], #16
cfi_remember_state
cfi_def_cfa_offset (0)
cfi_restore (r4)
cfi_restore (r5)
cfi_restore (r6)
cfi_restore (r7)
sub result, result, r1
bx lr
#endif
.endm
.text
.p2align 5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
sub r0, r2, r3
bx lr
nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
sfi_breg src1, \
ldrb r2, [\B]
sfi_breg src2, \
ldrb r3, [\B]
cmp r2, #1
it cs
cmpcs r2, r3
bne .Lfastpath_exit
#endif
strd r4, r5, [sp, #-16]!
cfi_def_cfa_offset (16)
cfi_offset (r4, -16)
cfi_offset (r5, -12)
orr tmp1, src1, src2
strd r6, r7, [sp, #8]
cfi_offset (r6, -8)
cfi_offset (r7, -4)
mvn const_m1, #0
lsl r2, tmp1, #29
cbz r2, .Lloop_aligned8
.Lnot_aligned:
eor tmp1, src1, src2
tst tmp1, #7
bne .Lmisaligned8
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
and tmp1, src1, #7
bic src1, src1, #7
and tmp2, tmp1, #3
bic src2, src2, #7
lsl tmp2, tmp2, #3 /* Bytes -> bits. */
sfi_breg src1, \
ldrd data1a, data1b, [\B], #16
tst tmp1, #4
sfi_breg src2, \
ldrd data2a, data2b, [\B], #16
prepare_mask tmp1, tmp2
apply_mask data1a, tmp1
apply_mask data2a, tmp1
beq .Lstart_realigned8
apply_mask data1b, tmp1
mov data1a, const_m1
apply_mask data2b, tmp1
mov data2a, const_m1
b .Lstart_realigned8
/* Unwind the inner loop by a factor of 2, giving 16 bytes per
pass. */
.p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
.p2align 2 /* Always word aligned. */
.Lloop_aligned8:
sfi_breg src1, \
ldrd data1a, data1b, [\B], #16
sfi_breg src2, \
ldrd data2a, data2b, [\B], #16
.Lstart_realigned8:
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
cbnz syndrome_a, .Ldiff_in_a
uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
eor syndrome_b, data1b, data2b
sel syndrome_b, syndrome_b, const_m1
cbnz syndrome_b, .Ldiff_in_b
sfi_breg src1, \
ldrd data1a, data1b, [\B, #-8]
sfi_breg src2, \
ldrd data2a, data2b, [\B, #-8]
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
eor syndrome_b, data1b, data2b
sel syndrome_b, syndrome_b, const_m1
/* Can't use CBZ for backwards branch. */
orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
beq .Lloop_aligned8
.Ldiff_found:
cbnz syndrome_a, .Ldiff_in_a
.Ldiff_in_b:
strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
.Ldiff_in_a:
cfi_restore_state
strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
cfi_restore_state
.Lmisaligned8:
tst tmp1, #3
bne .Lmisaligned4
ands tmp1, src1, #3
bne .Lmutual_align4
/* Unrolled by a factor of 2, to reduce the number of post-increment
operations. */
.Lloop_aligned4:
sfi_breg src1, \
ldr data1, [\B], #8
sfi_breg src2, \
ldr data2, [\B], #8
.Lstart_realigned4:
uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cbnz syndrome, .Laligned4_done
sfi_breg src1, \
ldr data1, [\B, #-4]
sfi_breg src2, \
ldr data2, [\B, #-4]
uadd8 syndrome, data1, const_m1
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cmp syndrome, #0
beq .Lloop_aligned4
.Laligned4_done:
strcmp_epilogue_aligned syndrome, data1, data2, 0
.Lmutual_align4:
cfi_restore_state
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
lsl tmp1, tmp1, #3 /* Bytes -> bits. */
bic src1, src1, #3
sfi_breg src1, \
ldr data1, [\B], #8
bic src2, src2, #3
sfi_breg src2, \
ldr data2, [\B], #8
prepare_mask tmp1, tmp1
apply_mask data1, tmp1
apply_mask data2, tmp1
b .Lstart_realigned4
.Lmisaligned4:
ands tmp1, src1, #3
beq .Lsrc1_aligned
sub src2, src2, tmp1
bic src1, src1, #3
lsls tmp1, tmp1, #31
sfi_breg src1, \
ldr data1, [\B], #4
beq .Laligned_m2
bcs .Laligned_m1
#if STRCMP_PRECHECK == 0
sfi_breg src2, \
ldrb data2, [\B, #1]
uxtb tmp1, data1, ror #BYTE1_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m2:
sfi_breg src2, \
ldrb data2, [\B, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m1:
sfi_breg src2, \
ldrb data2, [\B, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
add src2, src2, #4
cbnz data2, .Lsrc1_aligned
#else /* STRCMP_PRECHECK */
/* If we've done the pre-check, then we don't need to check the
first byte again here. */
sfi_breg src2, \
ldrb data2, [\B, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m2:
sfi_breg src2, \
ldrb data2, [\B, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbnz data2, .Laligned_m1
#endif
.Lmisaligned_exit:
mov result, tmp1
ldr r4, [sp], #16
cfi_remember_state
cfi_def_cfa_offset (0)
cfi_restore (r4)
cfi_restore (r5)
cfi_restore (r6)
cfi_restore (r7)
bx lr
#if STRCMP_PRECHECK == 1
.Laligned_m1:
add src2, src2, #4
#endif
.Lsrc1_aligned:
cfi_restore_state
/* src1 is word aligned, but src2 has no common alignment
with it. */
sfi_breg src1, \
ldr data1, [\B], #4
lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
bic src2, src2, #3
sfi_breg src2, \
ldr data2, [\B], #4
bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
.Loverlap3:
bic tmp1, data1, #MSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #8
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
sfi_breg src2, \
ldr data2, [\B], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #24
bne 6f
sfi_breg src1, \
ldr data1, [\B], #4
b .Loverlap3
4:
S2LO data2, data2, #8
b .Lstrcmp_tail
5:
bics syndrome, syndrome, #MSB
bne .Lstrcmp_done_equal
/* We can only get here if the MSB of data1 contains 0, so
fast-path the exit. */
sfi_breg src2, \
ldrb result, [\B]
ldrd r4, r5, [sp], #16
cfi_remember_state
cfi_def_cfa_offset (0)
cfi_restore (r4)
cfi_restore (r5)
/* R6/7 Not used in this sequence. */
cfi_restore (r6)
cfi_restore (r7)
neg result, result
bx lr
6:
cfi_restore_state
S2LO data1, data1, #24
and data2, data2, #LSB
b .Lstrcmp_tail
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
.Loverlap2:
and tmp1, data1, const_m1, S2LO #16
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #16
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
sfi_breg src2, \
ldr data2, [\B], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #16
bne 6f
sfi_breg src1, \
ldr data1, [\B], #4
b .Loverlap2
4:
S2LO data2, data2, #16
b .Lstrcmp_tail
5:
ands syndrome, syndrome, const_m1, S2LO #16
bne .Lstrcmp_done_equal
sfi_breg src2, \
ldrh data2, [\B]
S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
lsl data2, data2, #16
#endif
b .Lstrcmp_tail
6:
S2LO data1, data1, #16
and data2, data2, const_m1, S2LO #16
b .Lstrcmp_tail
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
.Loverlap1:
and tmp1, data1, #LSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #24
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
sfi_breg src2, \
ldr data2, [\B], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #8
bne 6f
sfi_breg src1, \
ldr data1, [\B], #4
b .Loverlap1
4:
S2LO data2, data2, #24
b .Lstrcmp_tail
5:
tst syndrome, #LSB
bne .Lstrcmp_done_equal
sfi_breg src2, \
ldr data2, [\B]
6:
S2LO data1, data1, #8
bic data2, data2, #MSB
b .Lstrcmp_tail
.Lstrcmp_done_equal:
mov result, #0
ldrd r4, r5, [sp], #16
cfi_remember_state
cfi_def_cfa_offset (0)
cfi_restore (r4)
cfi_restore (r5)
/* R6/7 not used in this sequence. */
cfi_restore (r6)
cfi_restore (r7)
bx lr
.Lstrcmp_tail:
cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
rev data1, data1
rev data2, data2
/* Now everything looks big-endian... */
#endif
uadd8 tmp1, data1, const_m1
eor tmp1, data1, data2
sel syndrome, tmp1, const_m1
clz tmp1, syndrome
lsl data1, data1, tmp1
lsl data2, data2, tmp1
lsr result, data1, #24
ldrd r4, r5, [sp], #16
cfi_def_cfa_offset (0)
cfi_restore (r4)
cfi_restore (r5)
/* R6/7 not used in this sequence. */
cfi_restore (r6)
cfi_restore (r7)
sub result, result, data2, lsr #24
bx lr
END (strcmp)
libc_hidden_builtin_def (strcmp)