PowerPC: strncpy/stpncpy optimization for PPC64/POWER7

The optimization is achieved by the following techniques:
  > data alignment [gain from aligned memory access on read/write]
  > POWER7 gains performance with loop unrolling/unwinding
    [gain by reduction of branch penalty].
  > zero padding done by calling optimized memset
This commit is contained in:
Vidya Ranganathan 2014-05-05 19:10:45 -05:00 committed by Adhemerval Zanella
parent 978a41c357
commit f360f94a05
11 changed files with 610 additions and 1 deletions

View File

@ -1,3 +1,20 @@
2014-05-06 Vidya Ranganathan <vidya@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power7/strncpy.S: New file: Optimization.
* sysdeps/powerpc/powerpc64/multiarch/strncpy.c: New file:
multiarch strncpy for PPC64.
* sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: New file
* sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S: New file
* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strncpy, stpncpy
multiarch optimizations.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c:
(__libc_ifunc_impl_list): Likewise.
* sysdeps/powerpc/powerpc64/power7/stpncpy.S: New file: Optimization.
* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c: New file:
multiarch stpncpy for PPC64.
* sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c: New file
* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S: New file
2014-05-06 Andreas Schwab <schwab@suse.de> 2014-05-06 Andreas Schwab <schwab@suse.de>
[BZ #16912] [BZ #16912]

View File

@ -16,7 +16,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \ strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \ strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
strpbrk-power7 strpbrk-ppc64 strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops

View File

@ -278,5 +278,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strpbrk, 1, IFUNC_IMPL_ADD (array, i, strpbrk, 1,
__strpbrk_ppc)) __strpbrk_ppc))
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
IFUNC_IMPL_ADD (array, i, strncpy,
hwcap & PPC_FEATURE_HAS_VSX,
__strncpy_power7)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_ppc))
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap & PPC_FEATURE_HAS_VSX,
__stpncpy_power7)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
__stpncpy_ppc))
return i; return i;
} }

View File

@ -0,0 +1,44 @@
/* Optimized stpncpy implementation for POWER7.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Select the stpncpy flavour of the shared strncpy/stpncpy source.  */
#define USE_AS_STPNCPY
/* Replace EALIGN so that the function entry emitted by the included
source is always named __stpncpy_power7; the NAME argument is ignored.  */
#undef EALIGN
#define EALIGN(name, alignt, words) \
.section ".text"; \
ENTRY_2(__stpncpy_power7) \
.align ALIGNARG(alignt); \
EALIGN_W_##words; \
BODY_LABEL(__stpncpy_power7): \
cfi_startproc; \
LOCALENTRY(__stpncpy_power7)
/* Likewise replace END so the function is closed under the name
__stpncpy_power7; the NAME argument is ignored.  */
#undef END
#define END(name) \
cfi_endproc; \
TRACEBACK(__stpncpy_power7) \
END_2(__stpncpy_power7)
/* Make libc_hidden_builtin_def expand to nothing in this variant.  */
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
/* Routine the included source calls to zero-fill the tail of the
destination (it only picks its own default when MEMSET is undefined).  */
#define MEMSET __memset_power7
#include <sysdeps/powerpc/powerpc64/power7/stpncpy.S>

View File

@ -0,0 +1,26 @@
/* Default stpncpy implementation for PowerPC64.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Name under which the generic C implementation is built.
NOTE(review): presumably string/stpncpy.c names its function STPNCPY when
that macro is defined -- confirm against that file.  */
#define STPNCPY __stpncpy_ppc
#ifdef SHARED
/* In shared builds replace libc_hidden_def so that, whatever name it is
given, it expands to a __hidden_ver1 declaration tying __GI___stpncpy to
__stpncpy_ppc.  */
#undef libc_hidden_def
#define libc_hidden_def(name) \
__hidden_ver1 (__stpncpy_ppc, __GI___stpncpy, __stpncpy_ppc);
#endif
#include <string/stpncpy.c>

View File

@ -0,0 +1,33 @@
/* Multiple versions of stpncpy. PowerPC64 version.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Define multiple versions only for the definition in libc.  */
#ifndef NOT_IN_libc
# include <string.h>
# include <shlib-compat.h>
# include "init-arch.h"
/* The two candidate implementations, both declared with the type of
__stpncpy and hidden visibility.  */
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
/* Resolve __stpncpy to the POWER7 implementation when hwcap has the
PPC_FEATURE_HAS_VSX bit set, and to the generic PowerPC64 implementation
otherwise.  */
libc_ifunc (__stpncpy,
(hwcap & PPC_FEATURE_HAS_VSX)
? __stpncpy_power7
: __stpncpy_ppc);
/* Export stpncpy as a weak alias of __stpncpy.  */
weak_alias (__stpncpy, stpncpy)
#endif

View File

@ -0,0 +1,42 @@
/* Optimized strncpy implementation for POWER7.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Replace EALIGN so that the function entry emitted by the included
source is always named __strncpy_power7; the NAME argument is ignored.  */
#undef EALIGN
#define EALIGN(name, alignt, words) \
.section ".text"; \
ENTRY_2(__strncpy_power7) \
.align ALIGNARG(alignt); \
EALIGN_W_##words; \
BODY_LABEL(__strncpy_power7): \
cfi_startproc; \
LOCALENTRY(__strncpy_power7)
/* Likewise replace END so the function is closed under the name
__strncpy_power7; the NAME argument is ignored.  */
#undef END
#define END(name) \
cfi_endproc; \
TRACEBACK(__strncpy_power7) \
END_2(__strncpy_power7)
/* Make libc_hidden_builtin_def expand to nothing in this variant.  */
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
/* Routine the included source calls to zero-fill the tail of the
destination (it only picks its own default when MEMSET is undefined).  */
#define MEMSET __memset_power7
#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>

View File

@ -0,0 +1,33 @@
/* Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <string.h>
/* Name under which the generic C implementation is built.
NOTE(review): presumably string/strncpy.c names its function STRNCPY when
that macro is defined -- confirm against that file.  */
#define STRNCPY __strncpy_ppc
/* Replace weak_alias so that any alias created by the included source is
a weak alias of __strncpy_ppc; the NAME argument is ignored.  */
#undef weak_alias
#define weak_alias(name, aliasname) \
extern __typeof (__strncpy_ppc) aliasname \
__attribute__ ((weak, alias ("__strncpy_ppc")));
#if !defined(NOT_IN_libc) && defined(SHARED)
/* For the shared libc, replace libc_hidden_builtin_def so that it expands
to a __hidden_ver1 declaration tying __GI_strncpy to __strncpy_ppc.  */
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
__hidden_ver1(__strncpy_ppc, __GI_strncpy, __strncpy_ppc);
#endif
/* Declare the generic implementation with the type of strncpy and hidden
visibility before its definition is pulled in.  */
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
#include <string/strncpy.c>

View File

@ -0,0 +1,35 @@
/* Multiple versions of strncpy.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Define multiple versions only for the definition in libc.  */
#ifndef NOT_IN_libc
# include <string.h>
# include <shlib-compat.h>
# include "init-arch.h"
/* The two candidate implementations, both declared with the type of
strncpy and hidden visibility.  */
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly.
NOTE(review): nothing visible in this file implements the avoidance
described above (string.h is included normally and strncpy is used
directly as the ifunc name); the sentence may be stale -- confirm.  */
/* Resolve strncpy to the POWER7 implementation when hwcap has the
PPC_FEATURE_HAS_VSX bit set, and to the generic PowerPC64 implementation
otherwise.  */
libc_ifunc (strncpy,
(hwcap & PPC_FEATURE_HAS_VSX)
? __strncpy_power7
: __strncpy_ppc);
#endif

View File

@ -0,0 +1,24 @@
/* Optimized stpncpy implementation for PowerPC64/POWER7.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Build the shared strncpy/stpncpy source in its stpncpy flavour: with
USE_AS_STPNCPY defined the function is named __stpncpy and returns a
pointer into the destination instead of the original dst.  */
#define USE_AS_STPNCPY
#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
/* Public and internal names for the function defined above.
NOTE(review): multiarch/stpncpy-power7.S includes this file after
neutralising only libc_hidden_builtin_def; confirm that weak_alias and
libc_hidden_def below do not clash with the ifunc definition of
__stpncpy/stpncpy in multiarch/stpncpy.c.  */
weak_alias (__stpncpy, stpncpy)
libc_hidden_def (__stpncpy)
libc_hidden_builtin_def (stpncpy)

View File

@ -0,0 +1,338 @@
/* Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Implements the functions
char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
AND
char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
The algorithm is as follows:
> if src and dest are both 8 byte aligned, perform doubleword copy
else
> copy byte by byte on unaligned addresses.
On the aligned path each doubleword is checked for a NUL byte with the
cmpb instruction before it is stored. */
/* The focus on optimization for performance improvements is as follows:
1. data alignment [gain from aligned memory access on read/write]
2. POWER7 gains performance with loop unrolling/unwinding
[gain by reduction of branch penalty].
3. The final pad with null bytes is done by calling an optimized
memset. */
/* Name of the function being defined: __stpncpy when built as stpncpy,
strncpy otherwise.  */
#ifdef USE_AS_STPNCPY
# define FUNC_NAME __stpncpy
#else
# define FUNC_NAME strncpy
#endif
/* Size of the stack frame created in the prologue: the minimum frame
plus 32 bytes.  */
#define FRAMESIZE (FRAME_MIN_SIZE+32)
#ifndef MEMSET
/* For builds with no IFUNC support, local calls should be made to internal
GLIBC symbol (created by libc_hidden_builtin_def). */
# ifdef SHARED
# define MEMSET __GI_memset
# else
# define MEMSET memset
# endif
#endif
/* Function entry: save state, build the stack frame and choose between
the doubleword path (src and dst both 8-byte aligned) and the byte path.
Register roles set up here: r9 = running dst, r18 = original dst,
r11 = n / 8 (doubleword count), r5 = remaining byte count.  */
.machine power7
EALIGN(FUNC_NAME, 4, 0)
CALL_MCOUNT 3
mflr r0 /* copy the link register LR into r0 */
or r10, r3, r4 /* combine the low bits of dst and src */
rldicl. r8, r10, 0, 61 /* r8 = (dst | src) & 7 ; zero if both aligned */
std r19, -8(r1) /* save callers register , r19 */
std r18, -16(r1) /* save callers register , r18 */
std r0, 16(r1) /* store the link register */
stdu r1, -FRAMESIZE(r1) /* create the stack frame */
mr r9, r3 /* r9 = working copy of dst */
mr r18, r3 /* save dst for the strncpy return value */
bne 0, L(byte_by_byte) /* not both aligned: copy byte by byte */
srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */
cmpldi cr7, r11, 3 /* use the 4x unrolled loop only if count > 3 */
ble 7, L(update1)
ld r10, 0(r4) /* load doubleWord from src */
cmpb r8, r10, r8 /* look for a NUL byte (r8 is 0 on this path) */
cmpdi cr7, r8, 0 /* zero result: no NUL found, continue */
bne cr7, L(update3)
std r10, 0(r3) /* copy doubleword at offset=0 */
ld r10, 8(r4) /* load next doubleword from offset=8 */
cmpb r8, r10, r8 /* look for a NUL byte in it (r8 is still 0) */
cmpdi cr7, r8, 0 /* zero result: no NUL found, continue */
bne 7,L(HopBy8)
/* Set up the unrolled loop: CTR = (count - 4) / 4 + 1, r7/r6 = dst/src
base for the current iteration, r12 = 0 as the cmpb comparand.  */
addi r8, r11, -4
mr r7, r3
srdi r8, r8, 2
mr r6, r4
addi r8, r8, 1
li r12, 0
mtctr r8
b L(dwordCopy)
/* Unrolled doubleword copy.  Execution enters at L(dwordCopy) with the
doubleword for offset 8 already loaded in r10 and known to be NUL-free.
Each doubleword is loaded, tested with cmpb against zero and stored only
if it contains no NUL byte; on a NUL the code branches to a fix-up label
that re-synchronises r9 (dst), r4 (src), r5 (bytes left) and r0
(doublewords left) and hands over to L(dWordUnrollOFF).
r6/r7 hold the src/dst values from before r4/r9 were advanced by 32.  */
.p2align 4
L(dWordUnroll):
std r8, 16(r9) /* copy dword at offset=16 */
ld r8, 24(r4) /* load dword,perform loop unrolling again */
cmpb r10, r8, r10 /* r10 is 0 here (previous cmpb result) */
cmpdi cr7, r10, 0
bne cr7, L(HopBy24)
std r8, 24(r7) /* copy dword at offset=24 */
addi r9, r9, 32 /* advance dst by 32 */
addi r4, r4, 32 /* advance src by 32 */
bdz L(leftDwords) /* CTR exhausted: leave the unrolled loop */
ld r3, 32(r6) /* same address as 0(r4) after the advance */
cmpb r8, r3, r10 /* r10 is still 0 */
cmpdi cr7, r8, 0
bne cr7, L(update2)
std r3, 32(r7) /* copy dword at offset=0 of the new group */
ld r10, 40(r6) /* dword at offset=8 of the new group */
cmpb r8, r10, r8 /* r8 is 0 here (previous cmpb result) */
cmpdi cr7, r8, 0
bne cr7, L(HopBy40)
mr r6, r4 /* update values */
mr r7, r9
mr r11, r0 /* doublewords left, computed in L(dwordCopy) */
mr r5, r19 /* bytes left, computed in L(dwordCopy) */
L(dwordCopy):
std r10, 8(r9) /* copy dword at offset=8 */
addi r19, r5, -32 /* r19 = bytes left after this group of 4 */
addi r0, r11, -4 /* r0 = dwords left after this group of 4 */
ld r8, 16(r4)
cmpb r10, r8, r12 /* r12 is 0: look for a NUL byte */
cmpdi cr7, r10, 0
beq cr7, L(dWordUnroll)
/* NUL found in the dword at offset 16: account for the two dwords
already stored and fall through to the non-unrolled loop.  */
addi r9, r9, 16 /* increment dst by 16 */
addi r4, r4, 16 /* increment src by 16 */
addi r5, r5, -16 /* decrement length 'n' by 16 */
addi r0, r11, -2 /* decrement loop counter */
/* Non-unrolled doubleword copy: copies r0 doublewords from r4 to r9, one
per iteration, leaving for the byte loop as soon as a doubleword contains
a NUL byte.  r5 is reduced by 8 for every doubleword stored.  */
L(dWordUnrollOFF):
ld r10, 0(r4) /* load first dword */
li r8, 0 /* load mask */
cmpb r8, r10, r8 /* look for a NUL byte in the dword */
cmpdi cr7, r8, 0
bne cr7, L(byte_by_byte) /* NUL present: finish byte by byte */
mtctr r0 /* CTR = number of dwords left to copy */
li r7, 0 /* zero comparand for cmpb in the loop */
b L(CopyDword)
.p2align 4
L(loadDWordandCompare):
ld r10, 0(r4) /* load next dword */
cmpb r8, r10, r7 /* look for a NUL byte in it */
cmpdi cr7, r8, 0
bne cr7, L(byte_by_byte) /* NUL present: finish byte by byte */
L(CopyDword):
addi r9, r9, 8 /* advance dst */
std r10, -8(r9) /* store the dword at the old dst */
addi r4, r4, 8 /* advance src */
addi r5, r5, -8 /* 8 fewer bytes left */
bdnz L(loadDWordandCompare)
/* CTR exhausted: fall through to the byte loop for the remainder.  */
/* Byte copy, unrolled four times.  Copies r5 / 4 groups of four bytes
from r4 to r19 (initialised from r9), testing every byte for NUL.  When a
NUL is copied, control moves to the code that computes how many bytes
remain to be zero-filled; when all groups are done, L(update0) handles the
last r5 & 3 bytes.  r5 itself is not modified in this loop.  */
L(byte_by_byte):
cmpldi cr7, r5, 3
ble cr7, L(verifyByte) /* fewer than 4 bytes left: no unrolling */
srdi r10, r5, 2 /* number of 4-byte groups */
mr r19, r9 /* r19 = running dst for the byte loop */
mtctr r10
b L(firstByteUnroll)
.p2align 4
L(bytes_unroll):
lbz r10, 1(r4) /* load byte from src */
cmpdi cr7, r10, 0 /* compare for NULL */
stb r10, 1(r19) /* store byte to dst */
beq cr7, L(updtDestComputeN2ndByte)
addi r4, r4, 4 /* advance src */
lbz r10, -2(r4) /* third byte of the group */
cmpdi cr7, r10, 0
stb r10, 2(r19)
beq cr7, L(updtDestComputeN3rdByte)
lbz r10, -1(r4) /* fourth byte of the group */
addi r19, r19, 4 /* advance dst past the group */
cmpdi cr7, r10, 0
stb r10, -1(r19)
beq cr7, L(ComputeNByte)
bdz L(update0) /* all groups copied without finding a NUL */
L(firstByteUnroll):
lbz r10, 0(r4) /* first byte of the group */
cmpdi cr7, r10, 0 /* compare for NULL */
stb r10, 0(r19)
bne cr7, L(bytes_unroll)
addi r19, r19, 1 /* NUL stored: r19 points just past it */
/* A NUL byte has been copied and r19 points just past it.  Work out how
many bytes of the destination are still unwritten, zero them with MEMSET
and return.  */
L(ComputeNByte):
subf r9, r19, r9 /* r9 = r9 - r19 = -(bytes written by the byte loop) */
add r8, r9, r5 /* r8 = number of bytes left to fill */
L(zeroFill):
cmpdi cr7, r8, 0 /* compare if length is zero */
beq cr7, L(update3return)
mr r3, r19 /* start of the area to fill */
li r4, 0 /* fill value: zero */
mr r5, r8 /* how many bytes to fill buffer with */
bl MEMSET /* call optimized memset */
nop
L(update3return):
#ifdef USE_AS_STPNCPY
addi r3, r19, -1 /* stpncpy: return the address of the copied NUL */
#endif
L(hop2return):
#ifndef USE_AS_STPNCPY
mr r3, r18 /* strncpy: return the original dst */
#endif
addi r1, r1, FRAMESIZE /* restore stack pointer */
ld r0, 16(r1) /* read the saved link register */
ld r18, -16(r1) /* restore callers save register, r18 */
ld r19, -8(r1) /* restore callers save register, r19 */
mtlr r0 /* put the saved value back into the link register */
blr /* return */
/* Tail copy: handles the last r5 & 3 bytes one at a time.  L(update0) is
the entry from the unrolled byte loop, where the running dst is in r19
rather than r9.  */
.p2align 4
L(update0):
mr r9, r19 /* bring r9 up to date with the byte loop's dst */
.p2align 4
L(verifyByte):
rldicl. r8, r5, 0, 62 /* r8 = r5 & 3 = bytes still to copy */
#ifdef USE_AS_STPNCPY
mr r3, r9 /* stpncpy: return value if nothing is left to copy */
#endif
beq cr0, L(hop2return) /* nothing left: return */
mtctr r8
addi r4, r4, -1 /* bias src for the lbzu pre-increment below */
mr r19, r9
b L(oneBYone)
.p2align 4
L(proceed):
bdz L(done) /* all bytes copied without finding a NUL */
L(oneBYone):
lbzu r10, 1(r4) /* copy byte */
addi r19, r19, 1
addi r8, r8, -1 /* one byte fewer left to write */
cmpdi cr7, r10, 0
stb r10, -1(r19)
bne cr7, L(proceed)
b L(zeroFill) /* NUL copied: zero-fill the remaining r8 bytes */
.p2align 4
L(done):
addi r1, r1, FRAMESIZE /* restore stack pointer */
#ifdef USE_AS_STPNCPY
mr r3, r19 /* set the return value */
#else
mr r3, r18 /* set the return value */
#endif
ld r0, 16(r1) /* read the saved link register */
ld r18, -16(r1) /* restore callers save register, r18 */
ld r19, -8(r1) /* restore callers save register, r19 */
mtlr r0 /* put the saved value back into the link register */
blr /* return */
/* Fix-up labels.  Each one re-synchronises the working registers
(r9 = dst, r4 = src, r5 = bytes left, r0 = doublewords left, or
r19/r8 for the byte loop) after an early exit from an unrolled loop, then
jumps to the code that finishes the copy.  */
/* Fewer than four doublewords in total: skip the unrolled loop.  */
L(update1):
mr r0, r11
mr r19, r5
.p2align 4
L(leftDwords):
cmpdi cr7, r0, 0 /* any whole doublewords left ? */
mr r5, r19
bne cr7, L(dWordUnrollOFF)
b L(byte_by_byte)
.p2align 4
/* NUL was the second byte of a 4-byte group in the byte loop.  */
L(updtDestComputeN2ndByte):
addi r19, r19, 2 /* update dst by 2 */
subf r9, r19, r9 /* compute distance covered */
add r8, r9, r5 /* r8 = number of bytes left to fill */
b L(zeroFill)
.p2align 4
/* NUL was the third byte of a 4-byte group in the byte loop.  */
L(updtDestComputeN3rdByte):
addi r19, r19, 3 /* update dst by 3 */
subf r9, r19, r9 /* compute distance covered */
add r8, r9, r5 /* r8 = number of bytes left to fill */
b L(zeroFill)
.p2align 4
/* NUL found in the doubleword at offset 24; three were stored.  */
L(HopBy24):
addi r9, r9, 24 /* increment dst by 24 */
addi r4, r4, 24 /* increment src by 24 */
addi r5, r5, -24 /* decrement length 'n' by 24 */
addi r0, r11, -3 /* decrement loop counter */
b L(dWordUnrollOFF)
.p2align 4
/* NUL found in the first doubleword of the next group; r9/r4 are already
advanced, only the byte count needs updating.  */
L(update2):
mr r5, r19
b L(dWordUnrollOFF)
.p2align 4
/* NUL found in the doubleword at offset 40 from the r6/r7 base; five
were stored.  */
L(HopBy40):
addi r9, r7, 40 /* increment dst by 40 */
addi r4, r6, 40 /* increment src by 40 */
addi r5, r5, -40 /* decrement length 'n' by 40 */
addi r0, r11, -5 /* decrement loop counter */
b L(dWordUnrollOFF)
/* NUL found in the very first doubleword; nothing was stored.  */
L(update3):
mr r0, r11
b L(dWordUnrollOFF)
/* NUL found in the doubleword at offset 8; one was stored.  */
L(HopBy8):
addi r9, r3, 8 /* increment dst by 8 */
addi r4, r4, 8 /* increment src by 8 */
addi r5, r5, -8 /* decrement length 'n' by 8 */
addi r0, r11, -1 /* decrement loop counter */
b L(dWordUnrollOFF)
END(FUNC_NAME)
#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif