powerpc64: Add optimized chacha20

It adds a vectorized ChaCha20 implementation based on libgcrypt
cipher/chacha20-ppc.c.  It targets POWER8 and it is used by default
on little-endian (LE) targets.

On a POWER8 it shows the following improvements (using formatted
bench-arc4random data):

POWER8

GENERIC                                    MB/s
-----------------------------------------------
arc4random [single-thread]               138.77
arc4random_buf(16) [single-thread]       174.36
arc4random_buf(32) [single-thread]       228.11
arc4random_buf(48) [single-thread]       252.31
arc4random_buf(64) [single-thread]       270.11
arc4random_buf(80) [single-thread]       278.97
arc4random_buf(96) [single-thread]       287.78
arc4random_buf(112) [single-thread]      291.92
arc4random_buf(128) [single-thread]      295.25

POWER8                                     MB/s
-----------------------------------------------
arc4random [single-thread]               198.06
arc4random_buf(16) [single-thread]       278.79
arc4random_buf(32) [single-thread]       448.89
arc4random_buf(48) [single-thread]       551.09
arc4random_buf(64) [single-thread]       646.12
arc4random_buf(80) [single-thread]       698.04
arc4random_buf(96) [single-thread]       756.06
arc4random_buf(112) [single-thread]      784.12
arc4random_buf(128) [single-thread]      808.04
-----------------------------------------------
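As a rough illustration of how such single-thread numbers are obtained, a
minimal throughput loop in the spirit of bench-arc4random (a simplified
sketch, not glibc's actual benchmark code; requires glibc >= 2.36 for
arc4random_buf) could look like:

/* Hypothetical stand-in for bench-arc4random: measures arc4random_buf
   throughput for one buffer size.  */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int
main (void)
{
  enum { CHUNK = 128, ITERS = 1 << 20 };
  static unsigned char buf[CHUNK];
  struct timespec start, end;

  clock_gettime (CLOCK_MONOTONIC, &start);
  for (long i = 0; i < ITERS; i++)
    arc4random_buf (buf, CHUNK);
  clock_gettime (CLOCK_MONOTONIC, &end);

  double secs = (end.tv_sec - start.tv_sec)
                + (end.tv_nsec - start.tv_nsec) / 1e9;
  double mbs = (double) CHUNK * ITERS / (1024.0 * 1024.0) / secs;
  printf ("arc4random_buf(%d) [single-thread]  %.2f MB/s\n", CHUNK, mbs);
  return 0;
}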

Checked on powerpc64-linux-gnu and powerpc64le-linux-gnu.
Reviewed-by: Paul E. Murphy <murphyp@linux.ibm.com>
Author: Adhemerval Zanella Netto, 2022-07-21 10:05:05 -03:00 (committed by Adhemerval Zanella)
Commit: b7060acfe8 (parent 84cfc6479b)
7 changed files with 347 additions and 1 deletion


@@ -391,7 +391,8 @@ Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
 <https://www.gnu.org/licenses/>. */
 
 sysdeps/aarch64/chacha20-aarch64.S, sysdeps/x86_64/chacha20-amd64-sse2.S,
-and sysdeps/x86_64/chacha20-amd64-avx2.S imports code from libgcrypt,
+sysdeps/x86_64/chacha20-amd64-avx2.S, and
+sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c imports code from libgcrypt,
 with the following notices:
 
 Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>


@@ -0,0 +1,4 @@
ifeq ($(subdir),stdlib)
sysdep_routines += chacha20-ppc
CFLAGS-chacha20-ppc.c += -mcpu=power8
endif


@@ -0,0 +1 @@
#include <sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c>


@@ -0,0 +1,42 @@
/* PowerPC optimization for ChaCha20.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <stdbool.h>
#include <ldsodefs.h>

unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
                                        const uint8_t *src, size_t nblks)
     attribute_hidden;

static void
chacha20_crypt (uint32_t *state, uint8_t *dst,
                const uint8_t *src, size_t bytes)
{
  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
                  "CHACHA20_BUFSIZE not multiple of 4");
  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
                  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");

  unsigned long int hwcap = GLRO(dl_hwcap);
  unsigned long int hwcap2 = GLRO(dl_hwcap2);
  if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
    __chacha20_power8_blocks4 (state, dst, src,
                               CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
  else
    chacha20_crypt_generic (state, dst, src, bytes);
}
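The dispatch above keys off the AT_HWCAP/AT_HWCAP2 bits the dynamic loader
records in GLRO(dl_hwcap) and GLRO(dl_hwcap2).  As an illustration only (not
part of this commit), the same test can be reproduced from an ordinary
program with getauxval to see whether a given machine would take the POWER8
path:

/* Illustrative only: query the same HWCAP bits from user code.  On powerpc,
   <sys/auxv.h> also provides the PPC_FEATURE* constants.  */
#include <stdio.h>
#include <sys/auxv.h>

int
main (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  unsigned long hwcap2 = getauxval (AT_HWCAP2);
  if ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && (hwcap & PPC_FEATURE_HAS_ALTIVEC))
    puts ("POWER8 ChaCha20 path would be selected");
  else
    puts ("generic ChaCha20 path would be selected");
  return 0;
}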


@@ -1,3 +1,8 @@
 ifeq ($(subdir),string)
 sysdep_routines += strcasestr-ppc64
 endif
+
+ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+endif


@@ -0,0 +1,256 @@
/* Optimized PowerPC implementation of ChaCha20 cipher.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* chacha20-ppc.c - PowerPC vector implementation of ChaCha20
Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
This file is part of Libgcrypt.
Libgcrypt is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
Libgcrypt is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this program; if not, see <https://www.gnu.org/licenses/>.
*/
#include <altivec.h>
#include <endian.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/cdefs.h>
typedef vector unsigned char vector16x_u8;
typedef vector unsigned int vector4x_u32;
typedef vector unsigned long long vector2x_u64;
#if __BYTE_ORDER == __BIG_ENDIAN
static const vector16x_u8 le_bswap_const =
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
#endif
static inline vector4x_u32
vec_rol_elems (vector4x_u32 v, unsigned int idx)
{
#if __BYTE_ORDER != __BIG_ENDIAN
  return vec_sld (v, v, (16 - (4 * idx)) & 15);
#else
  return vec_sld (v, v, (4 * idx) & 15);
#endif
}

static inline vector4x_u32
vec_load_le (unsigned long offset, const unsigned char *ptr)
{
  vector4x_u32 vec;
  vec = vec_vsx_ld (offset, (const uint32_t *)ptr);
#if __BYTE_ORDER == __BIG_ENDIAN
  vec = (vector4x_u32) vec_perm ((vector16x_u8)vec, (vector16x_u8)vec,
                                 le_bswap_const);
#endif
  return vec;
}

static inline void
vec_store_le (vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
{
#if __BYTE_ORDER == __BIG_ENDIAN
  vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
                               le_bswap_const);
#endif
  vec_vsx_st (vec, offset, (uint32_t *)ptr);
}

static inline vector4x_u32
vec_add_ctr_u64 (vector4x_u32 v, vector4x_u32 a)
{
#if __BYTE_ORDER == __BIG_ENDIAN
  static const vector16x_u8 swap32 =
    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
  vector2x_u64 vec, add, sum;

  vec = (vector2x_u64)vec_perm ((vector16x_u8)v, (vector16x_u8)v, swap32);
  add = (vector2x_u64)vec_perm ((vector16x_u8)a, (vector16x_u8)a, swap32);
  sum = vec + add;
  return (vector4x_u32)vec_perm ((vector16x_u8)sum, (vector16x_u8)sum, swap32);
#else
  return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
#endif
}
/**********************************************************************
4-way chacha20
**********************************************************************/
#define ROTATE(v1,rolv) \
__asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))
#define PLUS(ds,s) \
((ds) += (s))
#define XOR(ds,s) \
((ds) ^= (s))
#define ADD_U64(v,a) \
(v = vec_add_ctr_u64(v, a))
/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3) ({ \
vector4x_u32 t1 = vec_mergeh(x0, x2); \
vector4x_u32 t2 = vec_mergel(x0, x2); \
vector4x_u32 t3 = vec_mergeh(x1, x3); \
x3 = vec_mergel(x1, x3); \
x0 = vec_mergeh(t1, t3); \
x1 = vec_mergel(t1, t3); \
x2 = vec_mergeh(t2, x3); \
x3 = vec_mergel(t2, x3); \
})
#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \
PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
ROTATE(d1, rotate_16); ROTATE(d2, rotate_16); \
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE(b1, rotate_12); ROTATE(b2, rotate_12); \
PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
ROTATE(d1, rotate_8); ROTATE(d2, rotate_8); \
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
unsigned int attribute_hidden
__chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst, const uint8_t *src,
                           size_t nblks)
{
  vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
  vector4x_u32 counter_4 = { 4, 0, 0, 0 };
  vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
  vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
  vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
  vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
  vector4x_u32 state0, state1, state2, state3;
  vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
  vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
  vector4x_u32 tmp;
  int i;

  /* Force preload of constants to vector registers.  */
  __asm__ ("": "+v" (counters_0123) :: "memory");
  __asm__ ("": "+v" (counter_4) :: "memory");
  __asm__ ("": "+v" (rotate_16) :: "memory");
  __asm__ ("": "+v" (rotate_12) :: "memory");
  __asm__ ("": "+v" (rotate_8) :: "memory");
  __asm__ ("": "+v" (rotate_7) :: "memory");

  state0 = vec_vsx_ld (0 * 16, state);
  state1 = vec_vsx_ld (1 * 16, state);
  state2 = vec_vsx_ld (2 * 16, state);
  state3 = vec_vsx_ld (3 * 16, state);

  do
    {
      v0 = vec_splat (state0, 0);
      v1 = vec_splat (state0, 1);
      v2 = vec_splat (state0, 2);
      v3 = vec_splat (state0, 3);
      v4 = vec_splat (state1, 0);
      v5 = vec_splat (state1, 1);
      v6 = vec_splat (state1, 2);
      v7 = vec_splat (state1, 3);
      v8 = vec_splat (state2, 0);
      v9 = vec_splat (state2, 1);
      v10 = vec_splat (state2, 2);
      v11 = vec_splat (state2, 3);
      v12 = vec_splat (state3, 0);
      v13 = vec_splat (state3, 1);
      v14 = vec_splat (state3, 2);
      v15 = vec_splat (state3, 3);

      v12 += counters_0123;
      v13 -= vec_cmplt (v12, counters_0123);

      for (i = 20; i > 0; i -= 2)
        {
          QUARTERROUND2 (v0, v4, v8, v12, v1, v5, v9, v13)
          QUARTERROUND2 (v2, v6, v10, v14, v3, v7, v11, v15)
          QUARTERROUND2 (v0, v5, v10, v15, v1, v6, v11, v12)
          QUARTERROUND2 (v2, v7, v8, v13, v3, v4, v9, v14)
        }

      v0 += vec_splat (state0, 0);
      v1 += vec_splat (state0, 1);
      v2 += vec_splat (state0, 2);
      v3 += vec_splat (state0, 3);
      v4 += vec_splat (state1, 0);
      v5 += vec_splat (state1, 1);
      v6 += vec_splat (state1, 2);
      v7 += vec_splat (state1, 3);
      v8 += vec_splat (state2, 0);
      v9 += vec_splat (state2, 1);
      v10 += vec_splat (state2, 2);
      v11 += vec_splat (state2, 3);
      tmp = vec_splat (state3, 0);
      tmp += counters_0123;
      v12 += tmp;
      v13 += vec_splat (state3, 1) - vec_cmplt (tmp, counters_0123);
      v14 += vec_splat (state3, 2);
      v15 += vec_splat (state3, 3);
      ADD_U64 (state3, counter_4);

      transpose_4x4 (v0, v1, v2, v3);
      transpose_4x4 (v4, v5, v6, v7);
      transpose_4x4 (v8, v9, v10, v11);
      transpose_4x4 (v12, v13, v14, v15);

      vec_store_le (v0, (64 * 0 + 16 * 0), dst);
      vec_store_le (v1, (64 * 1 + 16 * 0), dst);
      vec_store_le (v2, (64 * 2 + 16 * 0), dst);
      vec_store_le (v3, (64 * 3 + 16 * 0), dst);
      vec_store_le (v4, (64 * 0 + 16 * 1), dst);
      vec_store_le (v5, (64 * 1 + 16 * 1), dst);
      vec_store_le (v6, (64 * 2 + 16 * 1), dst);
      vec_store_le (v7, (64 * 3 + 16 * 1), dst);
      vec_store_le (v8, (64 * 0 + 16 * 2), dst);
      vec_store_le (v9, (64 * 1 + 16 * 2), dst);
      vec_store_le (v10, (64 * 2 + 16 * 2), dst);
      vec_store_le (v11, (64 * 3 + 16 * 2), dst);
      vec_store_le (v12, (64 * 0 + 16 * 3), dst);
      vec_store_le (v13, (64 * 1 + 16 * 3), dst);
      vec_store_le (v14, (64 * 2 + 16 * 3), dst);
      vec_store_le (v15, (64 * 3 + 16 * 3), dst);

      src += 4*64;
      dst += 4*64;
      nblks -= 4;
    }
  while (nblks);

  vec_vsx_st (state3, 3 * 16, state);
  return 0;
}
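For readers comparing against the vector kernel above: each lane of
QUARTERROUND2 performs the standard ChaCha20 quarter round on one of four
independent blocks, two column/diagonal pairs at a time.  A plain scalar
reference (the well-known RFC 8439 definition, shown here only for
comparison, not part of the commit) is:

#include <stdint.h>

/* Scalar ChaCha20 quarter round; the vector ROTATE/PLUS/XOR macros above
   compute the same sequence with the 16/12/8/7-bit rotations done by
   vrlw on whole vectors.  */
static inline uint32_t
rotl32 (uint32_t v, int n)
{
  return (v << n) | (v >> (32 - n));
}

static inline void
quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b; *d ^= *a; *d = rotl32 (*d, 16);
  *c += *d; *b ^= *c; *b = rotl32 (*b, 12);
  *a += *b; *d ^= *a; *d = rotl32 (*d, 8);
  *c += *d; *b ^= *c; *b = rotl32 (*b, 7);
}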


@@ -0,0 +1,37 @@
/* PowerPC optimization for ChaCha20.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <stdbool.h>
#include <ldsodefs.h>

unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
                                        const uint8_t *src, size_t nblks)
     attribute_hidden;

static void
chacha20_crypt (uint32_t *state, uint8_t *dst,
                const uint8_t *src, size_t bytes)
{
  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
                  "CHACHA20_BUFSIZE not multiple of 4");
  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
                  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");

  __chacha20_power8_blocks4 (state, dst, src,
                             CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
}
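For reference, the 16-word state handed to __chacha20_power8_blocks4 follows
the usual ChaCha20 layout, with the 64-bit block counter in words 12-13 (the
kernel advances it by four blocks, as the ADD_U64 of counter_4 above shows).
The sketch below is hypothetical: the symbol is internal to libc
(attribute_hidden), so an application cannot actually link against it; it
only illustrates the layout the kernel expects.  The all-zero key and nonce
are placeholders.

/* Hypothetical usage sketch (not from the commit).  Words 0-3 are the
   "expand 32-byte k" constants, 4-11 the 256-bit key, 12-13 the 64-bit
   block counter, 14-15 the nonce.  */
#include <stddef.h>
#include <stdint.h>

extern unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
                                               const uint8_t *src,
                                               size_t nblks);

void
example (void)
{
  uint32_t state[16] = {
    0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, /* constants */
    0, 0, 0, 0, 0, 0, 0, 0,                         /* key (placeholder) */
    0, 0,                                           /* 64-bit counter */
    0, 0                                            /* nonce (placeholder) */
  };
  uint8_t src[4 * 64] = { 0 };  /* all-zero input, so dst gets raw keystream */
  uint8_t dst[4 * 64];

  /* Process four 64-byte blocks; state[12..13] is advanced by 4.  */
  __chacha20_power8_blocks4 (state, dst, src, 4);
}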