Mirror of https://sourceware.org/git/glibc.git
powerpc64: Add optimized chacha20
It adds a vectorized ChaCha20 implementation based on libgcrypt
cipher/chacha20-ppc.c.  It targets POWER8 and is used by default for LE.
On a POWER8 it shows the following improvements (using formatted
bench-arc4random data):

POWER8 GENERIC                             MB/s
-----------------------------------------------
arc4random [single-thread]                 138.77
arc4random_buf(16) [single-thread]         174.36
arc4random_buf(32) [single-thread]         228.11
arc4random_buf(48) [single-thread]         252.31
arc4random_buf(64) [single-thread]         270.11
arc4random_buf(80) [single-thread]         278.97
arc4random_buf(96) [single-thread]         287.78
arc4random_buf(112) [single-thread]        291.92
arc4random_buf(128) [single-thread]        295.25

POWER8                                     MB/s
-----------------------------------------------
arc4random [single-thread]                 198.06
arc4random_buf(16) [single-thread]         278.79
arc4random_buf(32) [single-thread]         448.89
arc4random_buf(48) [single-thread]         551.09
arc4random_buf(64) [single-thread]         646.12
arc4random_buf(80) [single-thread]         698.04
arc4random_buf(96) [single-thread]         756.06
arc4random_buf(112) [single-thread]        784.12
arc4random_buf(128) [single-thread]        808.04
-----------------------------------------------

Checked on powerpc64-linux-gnu and powerpc64le-linux-gnu.

Reviewed-by: Paul E. Murphy <murphyp@linux.ibm.com>
commit b7060acfe8
parent 84cfc6479b
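For context on what the numbers above measure: bench-arc4random reports
single-thread throughput of arc4random and arc4random_buf at the listed
buffer sizes.  A minimal sketch of an equivalent measurement loop (an
illustration, not the actual benchmark source; the iteration count and
timing method here are arbitrary choices):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>   /* arc4random_buf (glibc >= 2.36).  */
#include <time.h>

int
main (void)
{
  static const size_t sizes[] = { 16, 32, 48, 64, 80, 96, 112, 128 };
  uint8_t buf[128];

  for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    {
      struct timespec t0, t1;
      size_t total = 0;

      clock_gettime (CLOCK_MONOTONIC, &t0);
      for (int j = 0; j < (1 << 20); j++)
        {
          arc4random_buf (buf, sizes[i]);
          total += sizes[i];
        }
      clock_gettime (CLOCK_MONOTONIC, &t1);

      double secs = (t1.tv_sec - t0.tv_sec)
                    + (t1.tv_nsec - t0.tv_nsec) / 1e9;
      printf ("arc4random_buf(%zu) [single-thread] %8.2f MB/s\n",
              sizes[i], total / secs / 1e6);
    }
  return 0;
}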
LICENSES (3 changed lines)
@@ -391,7 +391,8 @@ Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
     <https://www.gnu.org/licenses/>.  */
 
 sysdeps/aarch64/chacha20-aarch64.S, sysdeps/x86_64/chacha20-amd64-sse2.S,
-and sysdeps/x86_64/chacha20-amd64-avx2.S imports code from libgcrypt,
+sysdeps/x86_64/chacha20-amd64-avx2.S, and
+sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c imports code from libgcrypt,
 with the following notices:
 
 Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
sysdeps/powerpc/powerpc64/be/multiarch/Makefile (new file, 4 lines)
@@ -0,0 +1,4 @@
ifeq ($(subdir),stdlib)
sysdep_routines += chacha20-ppc
CFLAGS-chacha20-ppc.c += -mcpu=power8
endif
sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c (new file, 1 line)
@@ -0,0 +1 @@
#include <sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c>
sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h (new file, 42 lines)
@@ -0,0 +1,42 @@
/* PowerPC optimization for ChaCha20.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <stdbool.h>
#include <ldsodefs.h>

unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
                                        const uint8_t *src, size_t nblks)
     attribute_hidden;

static void
chacha20_crypt (uint32_t *state, uint8_t *dst,
                const uint8_t *src, size_t bytes)
{
  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
                  "CHACHA20_BUFSIZE not multiple of 4");
  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
                  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");

  /* Use the POWER8 4-block kernel only if the kernel reports both
     ISA 2.07 (POWER8) and AltiVec support; otherwise fall back to the
     generic C implementation.  */
  unsigned long int hwcap = GLRO(dl_hwcap);
  unsigned long int hwcap2 = GLRO(dl_hwcap2);
  if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
    __chacha20_power8_blocks4 (state, dst, src,
                               CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
  else
    chacha20_crypt_generic (state, dst, src, bytes);
}
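For reference, the state array passed to chacha20_crypt uses the standard
16-word ChaCha20 layout from RFC 8439: four constant words, eight key
words, one block-counter word, and three nonce words.  A hedged sketch of
an initializer for that layout (illustrative only; glibc's internal setup
code is not part of this commit, and the helper name is hypothetical):

#include <stdint.h>
#include <string.h>

/* Illustrative initializer for the RFC 8439 ChaCha20 state layout.
   The memcpy calls assume a little-endian host; a big-endian host
   would need per-word byte swaps.  */
static void
chacha20_state_init (uint32_t state[16], const uint8_t key[32],
                     const uint8_t nonce[12], uint32_t counter)
{
  state[0] = 0x61707865;  /* "expa" */
  state[1] = 0x3320646e;  /* "nd 3" */
  state[2] = 0x79622d32;  /* "2-by" */
  state[3] = 0x6b206574;  /* "te k" */
  memcpy (&state[4], key, 32);
  state[12] = counter;
  memcpy (&state[13], nonce, 12);
}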
sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -1,3 +1,8 @@
 ifeq ($(subdir),string)
 sysdep_routines += strcasestr-ppc64
 endif
+
+ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+endif
sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c (new file, 256 lines)
@@ -0,0 +1,256 @@
/* Optimized PowerPC implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* chacha20-ppc.c - PowerPC vector implementation of ChaCha20
   Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

#include <altivec.h>
#include <endian.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/cdefs.h>

typedef vector unsigned char vector16x_u8;
typedef vector unsigned int vector4x_u32;
typedef vector unsigned long long vector2x_u64;

#if __BYTE_ORDER == __BIG_ENDIAN
static const vector16x_u8 le_bswap_const =
  { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
#endif

static inline vector4x_u32
vec_rol_elems (vector4x_u32 v, unsigned int idx)
{
#if __BYTE_ORDER != __BIG_ENDIAN
  return vec_sld (v, v, (16 - (4 * idx)) & 15);
#else
  return vec_sld (v, v, (4 * idx) & 15);
#endif
}

static inline vector4x_u32
vec_load_le (unsigned long offset, const unsigned char *ptr)
{
  vector4x_u32 vec;
  vec = vec_vsx_ld (offset, (const uint32_t *) ptr);
#if __BYTE_ORDER == __BIG_ENDIAN
  vec = (vector4x_u32) vec_perm ((vector16x_u8) vec, (vector16x_u8) vec,
				 le_bswap_const);
#endif
  return vec;
}

static inline void
vec_store_le (vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
{
#if __BYTE_ORDER == __BIG_ENDIAN
  vec = (vector4x_u32) vec_perm ((vector16x_u8) vec, (vector16x_u8) vec,
				 le_bswap_const);
#endif
  vec_vsx_st (vec, offset, (uint32_t *) ptr);
}

/* Add A to V with both interpreted as two 64-bit counters.  On big
   endian the 32-bit halves are swapped around the addition so the
   64-bit arithmetic sees them in little-endian word order.  */
static inline vector4x_u32
vec_add_ctr_u64 (vector4x_u32 v, vector4x_u32 a)
{
#if __BYTE_ORDER == __BIG_ENDIAN
  static const vector16x_u8 swap32 =
    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
  vector2x_u64 vec, add, sum;

  vec = (vector2x_u64) vec_perm ((vector16x_u8) v, (vector16x_u8) v, swap32);
  add = (vector2x_u64) vec_perm ((vector16x_u8) a, (vector16x_u8) a, swap32);
  sum = vec + add;
  return (vector4x_u32) vec_perm ((vector16x_u8) sum, (vector16x_u8) sum,
				  swap32);
#else
  return (vector4x_u32) ((vector2x_u64) (v) + (vector2x_u64) (a));
#endif
}

/**********************************************************************
  4-way chacha20
 **********************************************************************/

/* Rotate each 32-bit element of V1 left by the per-element counts in
   ROLV (vrlw: Vector Rotate Left Word).  */
#define ROTATE(v1,rolv) \
	__asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))

#define PLUS(ds,s) \
	((ds) += (s))

#define XOR(ds,s) \
	((ds) ^= (s))

#define ADD_U64(v,a) \
	(v = vec_add_ctr_u64(v, a))

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3) ({ \
	vector4x_u32 t1 = vec_mergeh(x0, x2); \
	vector4x_u32 t2 = vec_mergel(x0, x2); \
	vector4x_u32 t3 = vec_mergeh(x1, x3); \
	x3 = vec_mergel(x1, x3); \
	x0 = vec_mergeh(t1, t3); \
	x1 = vec_mergel(t1, t3); \
	x2 = vec_mergeh(t2, x3); \
	x3 = vec_mergel(t2, x3); \
	})

#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2)			\
	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
	    ROTATE(d1, rotate_16); ROTATE(d2, rotate_16);	\
	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
	    ROTATE(b1, rotate_12); ROTATE(b2, rotate_12);	\
	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
	    ROTATE(d1, rotate_8); ROTATE(d2, rotate_8);		\
	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
	    ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
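/* Note (added for exposition): QUARTERROUND2 performs two of the
   standard ChaCha20 quarter rounds at once, with each vector lane
   carrying the corresponding word of a different block.  The scalar
   quarter round being vectorized is, per RFC 8439:

     a += b; d ^= a; d = rotl32 (d, 16);
     c += d; b ^= c; b = rotl32 (b, 12);
     a += b; d ^= a; d = rotl32 (d, 8);
     c += d; b ^= c; b = rotl32 (b, 7);  */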
unsigned int attribute_hidden
__chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst, const uint8_t *src,
			   size_t nblks)
{
  vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
  vector4x_u32 counter_4 = { 4, 0, 0, 0 };
  vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
  vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
  vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
  vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
  vector4x_u32 state0, state1, state2, state3;
  vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
  vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
  vector4x_u32 tmp;
  int i;

  /* Force preload of constants to vector registers.  */
  __asm__ ("": "+v" (counters_0123) :: "memory");
  __asm__ ("": "+v" (counter_4) :: "memory");
  __asm__ ("": "+v" (rotate_16) :: "memory");
  __asm__ ("": "+v" (rotate_12) :: "memory");
  __asm__ ("": "+v" (rotate_8) :: "memory");
  __asm__ ("": "+v" (rotate_7) :: "memory");

  state0 = vec_vsx_ld (0 * 16, state);
  state1 = vec_vsx_ld (1 * 16, state);
  state2 = vec_vsx_ld (2 * 16, state);
  state3 = vec_vsx_ld (3 * 16, state);

  do
    {
      /* Splat each of the 16 state words across a vector, so lane N
	 of v0..v15 holds that word of block N.  */
      v0 = vec_splat (state0, 0);
      v1 = vec_splat (state0, 1);
      v2 = vec_splat (state0, 2);
      v3 = vec_splat (state0, 3);
      v4 = vec_splat (state1, 0);
      v5 = vec_splat (state1, 1);
      v6 = vec_splat (state1, 2);
      v7 = vec_splat (state1, 3);
      v8 = vec_splat (state2, 0);
      v9 = vec_splat (state2, 1);
      v10 = vec_splat (state2, 2);
      v11 = vec_splat (state2, 3);
      v12 = vec_splat (state3, 0);
      v13 = vec_splat (state3, 1);
      v14 = vec_splat (state3, 2);
      v15 = vec_splat (state3, 3);

      /* Give blocks 0..3 consecutive counters.  vec_cmplt yields ~0
	 (i.e. -1) in lanes where the low counter word wrapped, so
	 subtracting it propagates the carry into the high word.  */
      v12 += counters_0123;
      v13 -= vec_cmplt (v12, counters_0123);

      /* 20 rounds: 10 column-round/diagonal-round pairs.  */
      for (i = 20; i > 0; i -= 2)
	{
	  QUARTERROUND2 (v0, v4, v8, v12, v1, v5, v9, v13)
	  QUARTERROUND2 (v2, v6, v10, v14, v3, v7, v11, v15)
	  QUARTERROUND2 (v0, v5, v10, v15, v1, v6, v11, v12)
	  QUARTERROUND2 (v2, v7, v8, v13, v3, v4, v9, v14)
	}

      /* Add the input state back in, as the ChaCha20 block function
	 requires.  */
      v0 += vec_splat (state0, 0);
      v1 += vec_splat (state0, 1);
      v2 += vec_splat (state0, 2);
      v3 += vec_splat (state0, 3);
      v4 += vec_splat (state1, 0);
      v5 += vec_splat (state1, 1);
      v6 += vec_splat (state1, 2);
      v7 += vec_splat (state1, 3);
      v8 += vec_splat (state2, 0);
      v9 += vec_splat (state2, 1);
      v10 += vec_splat (state2, 2);
      v11 += vec_splat (state2, 3);
      tmp = vec_splat (state3, 0);
      tmp += counters_0123;
      v12 += tmp;
      v13 += vec_splat (state3, 1) - vec_cmplt (tmp, counters_0123);
      v14 += vec_splat (state3, 2);
      v15 += vec_splat (state3, 3);
      /* Advance the saved 64-bit counter past the 4 blocks just done.  */
      ADD_U64 (state3, counter_4);

      /* Convert from one-word-per-vector (lane = block) back to four
	 contiguous words per block before storing.  */
      transpose_4x4 (v0, v1, v2, v3);
      transpose_4x4 (v4, v5, v6, v7);
      transpose_4x4 (v8, v9, v10, v11);
      transpose_4x4 (v12, v13, v14, v15);

      vec_store_le (v0, (64 * 0 + 16 * 0), dst);
      vec_store_le (v1, (64 * 1 + 16 * 0), dst);
      vec_store_le (v2, (64 * 2 + 16 * 0), dst);
      vec_store_le (v3, (64 * 3 + 16 * 0), dst);

      vec_store_le (v4, (64 * 0 + 16 * 1), dst);
      vec_store_le (v5, (64 * 1 + 16 * 1), dst);
      vec_store_le (v6, (64 * 2 + 16 * 1), dst);
      vec_store_le (v7, (64 * 3 + 16 * 1), dst);

      vec_store_le (v8, (64 * 0 + 16 * 2), dst);
      vec_store_le (v9, (64 * 1 + 16 * 2), dst);
      vec_store_le (v10, (64 * 2 + 16 * 2), dst);
      vec_store_le (v11, (64 * 3 + 16 * 2), dst);

      vec_store_le (v12, (64 * 0 + 16 * 3), dst);
      vec_store_le (v13, (64 * 1 + 16 * 3), dst);
      vec_store_le (v14, (64 * 2 + 16 * 3), dst);
      vec_store_le (v15, (64 * 3 + 16 * 3), dst);

      src += 4 * 64;
      dst += 4 * 64;

      nblks -= 4;
    }
  while (nblks);

  /* Only the counter/nonce row changed; store it back.  */
  vec_vsx_st (state3, 3 * 16, state);

  return 0;
}
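As imported, the kernel generates raw keystream: it never reads the src
bytes (only the pointer is advanced), and nblks must be a positive
multiple of 4 since each iteration produces four 64-byte blocks.  A
hedged sketch of a direct call (illustrative only; within glibc the
function is reached only through the chacha20_crypt wrappers in the
chacha20_arch.h headers):

uint32_t state[16];     /* RFC 8439 layout, as sketched earlier.  */
uint8_t out[4 * 64];    /* Four 64-byte keystream blocks.  */

/* ... fill state with constants, key, counter, and nonce ... */
__chacha20_power8_blocks4 (state, out, out, 4);
/* out now holds 256 bytes of keystream; the counter in state has
   advanced by 4.  */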
sysdeps/powerpc/powerpc64/power8/chacha20_arch.h (new file, 37 lines)
@@ -0,0 +1,37 @@
/* PowerPC optimization for ChaCha20.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <stdbool.h>
#include <ldsodefs.h>

unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
                                        const uint8_t *src, size_t nblks)
     attribute_hidden;

/* Unlike the be/multiarch variant above, this dispatches to the POWER8
   kernel unconditionally: the power8 sysdeps directory is only used
   when the build's minimum ISA already is POWER8, so no hwcap check is
   needed.  */
static void
chacha20_crypt (uint32_t *state, uint8_t *dst,
                const uint8_t *src, size_t bytes)
{
  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
                  "CHACHA20_BUFSIZE not multiple of 4");
  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
                  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");

  __chacha20_power8_blocks4 (state, dst, src,
                             CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
}