SECLIB-667: Accelerate SHA-256 with A64 crypto extensions

Provide an additional pair of #defines, MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
and MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY. At most one of them may be
specified. If used, it is necessary to compile with -march=armv8-a+crypto.

The MBEDTLS_SHA256_PROCESS_ALT and MBEDTLS_SHA256_ALT mechanisms
continue to work, and are mutually exclusive with A64_CRYPTO.

There should be minimal code size impact if no A64_CRYPTO option is set.

Signed-off-by: Tom Cosgrove <tom.cosgrove@arm.com>
This commit is contained in:
Tom Cosgrove 2022-02-20 22:25:31 +00:00
parent 9b545c04f7
commit f3ebd90a1c
6 changed files with 364 additions and 11 deletions

View File

@ -0,0 +1,2 @@
Features
* A64 SHA-2 crypto extension support for SHA-256

View File

@ -592,6 +592,28 @@
#error "MBEDTLS_SHA256_C defined without MBEDTLS_SHA224_C"
#endif
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) && \
defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
#error "Must only define one of MBEDTLS_SHA256_USE_A64_CRYPTO_*"
#endif
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
#if !defined(MBEDTLS_SHA256_C)
#error "MBEDTLS_SHA256_USE_A64_CRYPTO_* defined without MBEDTLS_SHA256_C"
#endif
#if defined(MBEDTLS_SHA256_ALT) || defined(MBEDTLS_SHA256_PROCESS_ALT)
#error "MBEDTLS_SHA256_*ALT can't be used with MBEDTLS_SHA256_USE_A64_CRYPTO_*"
#endif
#if defined(__aarch64__) && !defined(__ARM_FEATURE_CRYPTO)
#error "Must use minimum -march=armv8-a+crypto for MBEDTLS_SHA256_USE_A64_CRYPTO_*"
#endif
#endif
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY) && !defined(__aarch64__)
#error "MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY defined on non-Aarch64 system"
#endif
#if defined(MBEDTLS_SSL_PROTO_TLS1_2) && ( !defined(MBEDTLS_SHA1_C) && \
!defined(MBEDTLS_SHA256_C) && !defined(MBEDTLS_SHA512_C) )
#error "MBEDTLS_SSL_PROTO_TLS1_2 defined, but not all prerequisites"

View File

@ -2759,6 +2759,56 @@
*/
#define MBEDTLS_SHA256_C
/**
* \def MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
*
* Enable acceleration of the SHA-256 cryptographic hash algorithm with the
* Arm A64 cryptographic extensions if they are available at runtime. If not,
* it will fall back to the C implementation.
*
* \note If MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT is defined when building
* for a non-Aarch64 build it will be silently ignored.
*
* \note The code uses Neon intrinsics, so \c CFLAGS must be set to a minimum
* of \c -march=armv8-a+crypto.
*
* \warning MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT cannot be defined at the
* same time as MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY.
*
* Requires: MBEDTLS_SHA256_C.
*
* Module: library/sha256.c
*
* Uncomment to have the library check for the A64 SHA-256 crypto extensions
* and use them if available.
*/
//#define MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
/**
* \def MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY
*
* Enable acceleration of the SHA-256 cryptographic hash algorithm with the
* Arm A64 cryptographic extensions, which must be available at runtime (or
* an illegal instruction fault will occur).
*
* \note This allows builds with a smaller code size than with
* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
*
* \note The code uses Neon intrinsics, so \c CFLAGS must be set to a minimum
* of \c -march=armv8-a+crypto.
*
* \warning MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY cannot be defined at the same
* time as MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT.
*
* Requires: MBEDTLS_SHA256_C.
*
* Module: library/sha256.c
*
* Uncomment to have the library use the A64 SHA-256 crypto extensions
* unconditionally.
*/
//#define MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY
/**
* \def MBEDTLS_SHA384_C
*

View File

@ -44,12 +44,97 @@
#endif /* MBEDTLS_PLATFORM_C */
#endif /* MBEDTLS_SELF_TEST */
#if defined(__aarch64__)
# if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
# include <arm_neon.h>
# endif
# if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) && defined(__linux__)
# include <sys/auxv.h>
# endif
#else
# undef MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY
# undef MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
#endif
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
/*
* Capability detection code comes early, so we can disable
* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT if no detection mechanism found
*/
#if defined(HWCAP_SHA2)
static int mbedtls_a64_crypto_sha256_check_support( void )
{
return( ( getauxval( AT_HWCAP ) & HWCAP_SHA2 ) ? 1 : 0 );
}
#elif defined(__APPLE__)
static int mbedtls_a64_crypto_sha256_check_support( void )
{
return( 1 );
}
#elif defined(__unix__) && defined(SIG_SETMASK)
/* Detection with SIGILL, setjmp() and longjmp() */
#include <signal.h>
#include <setjmp.h>
#ifndef asm
#define asm __asm__
#endif
static jmp_buf return_from_sigill;
/*
* A64 SHA256 support detection via SIGILL
*/
static void sigill_handler( int signal )
{
(void) signal;
longjmp( return_from_sigill, 1 );
}
static int mbedtls_a64_crypto_sha256_check_support( void )
{
struct sigaction old_action, new_action;
sigset_t old_mask;
if( sigprocmask( 0, NULL, &old_mask ) )
return( 0 );
sigemptyset( &new_action.sa_mask );
new_action.sa_flags = 0;
new_action.sa_handler = sigill_handler;
sigaction( SIGILL, &new_action, &old_action );
static int ret = 0;
if( setjmp( return_from_sigill ) == 0 ) /* First return only */
{
/* If this traps, we will return a second time from setjmp() with 1 */
asm( "sha256h q0, q0, v0.4s" : : : "v0" );
ret = 1;
}
sigaction( SIGILL, &old_action, NULL );
sigprocmask( SIG_SETMASK, &old_mask, NULL );
return( ret );
}
#else
#warning "No mechanism to detect A64_CRYPTO found, using C code only"
#undef MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
#endif /* HWCAP_SHA2, __APPLE__, __unix__ && SIG_SETMASK */
#endif /* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT */
#define SHA256_VALIDATE_RET(cond) \
MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA256_BAD_INPUT_DATA )
#define SHA256_VALIDATE(cond) MBEDTLS_INTERNAL_VALIDATE( cond )
#if !defined(MBEDTLS_SHA256_ALT)
#define SHA256_BLOCK_SIZE 64
void mbedtls_sha256_init( mbedtls_sha256_context *ctx )
{
SHA256_VALIDATE( ctx != NULL );
@ -143,6 +228,132 @@ static const uint32_t K[] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
#endif
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
# define mbedtls_internal_sha256_process_many_a64_crypto mbedtls_internal_sha256_process_many
# define mbedtls_internal_sha256_process_a64_crypto mbedtls_internal_sha256_process
#endif
static size_t mbedtls_internal_sha256_process_many_a64_crypto(
mbedtls_sha256_context *ctx, const uint8_t *msg, size_t len )
{
uint32x4_t abcd = vld1q_u32( &ctx->state[0] );
uint32x4_t efgh = vld1q_u32( &ctx->state[4] );
size_t processed = 0;
for( ;
len >= SHA256_BLOCK_SIZE;
processed += SHA256_BLOCK_SIZE,
msg += SHA256_BLOCK_SIZE,
len -= SHA256_BLOCK_SIZE )
{
uint32x4_t tmp, abcd_prev;
uint32x4_t abcd_orig = abcd;
uint32x4_t efgh_orig = efgh;
uint32x4_t sched0 = vld1q_u32( (const uint32_t *)( msg + 16 * 0 ) );
uint32x4_t sched1 = vld1q_u32( (const uint32_t *)( msg + 16 * 1 ) );
uint32x4_t sched2 = vld1q_u32( (const uint32_t *)( msg + 16 * 2 ) );
uint32x4_t sched3 = vld1q_u32( (const uint32_t *)( msg + 16 * 3 ) );
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* Will be true if not defined */
/* Untested on BE */
sched0 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched0 ) ) );
sched1 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched1 ) ) );
sched2 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched2 ) ) );
sched3 = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( sched3 ) ) );
#endif
/* Rounds 0 to 3 */
tmp = vaddq_u32( sched0, vld1q_u32( &K[0] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
/* Rounds 4 to 7 */
tmp = vaddq_u32( sched1, vld1q_u32( &K[4] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
/* Rounds 8 to 11 */
tmp = vaddq_u32( sched2, vld1q_u32( &K[8] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
/* Rounds 12 to 15 */
tmp = vaddq_u32( sched3, vld1q_u32( &K[12] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
for( int t = 16; t < 64; t += 16 )
{
/* Rounds t to t + 3 */
sched0 = vsha256su1q_u32( vsha256su0q_u32( sched0, sched1 ), sched2, sched3 );
tmp = vaddq_u32( sched0, vld1q_u32( &K[t] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
/* Rounds t + 4 to t + 7 */
sched1 = vsha256su1q_u32( vsha256su0q_u32( sched1, sched2 ), sched3, sched0 );
tmp = vaddq_u32( sched1, vld1q_u32( &K[t + 4] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
/* Rounds t + 8 to t + 11 */
sched2 = vsha256su1q_u32( vsha256su0q_u32( sched2, sched3 ), sched0, sched1 );
tmp = vaddq_u32( sched2, vld1q_u32( &K[t + 8] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
/* Rounds t + 12 to t + 15 */
sched3 = vsha256su1q_u32( vsha256su0q_u32( sched3, sched0 ), sched1, sched2 );
tmp = vaddq_u32( sched3, vld1q_u32( &K[t + 12] ) );
abcd_prev = abcd;
abcd = vsha256hq_u32( abcd_prev, efgh, tmp );
efgh = vsha256h2q_u32( efgh, abcd_prev, tmp );
}
abcd = vaddq_u32( abcd, abcd_orig );
efgh = vaddq_u32( efgh, efgh_orig );
}
vst1q_u32( &ctx->state[0], abcd );
vst1q_u32( &ctx->state[4], efgh );
return( processed );
}
int mbedtls_internal_sha256_process_a64_crypto( mbedtls_sha256_context *ctx,
const unsigned char data[SHA256_BLOCK_SIZE] )
{
return( ( mbedtls_internal_sha256_process_many_a64_crypto( ctx, data,
SHA256_BLOCK_SIZE ) == SHA256_BLOCK_SIZE ) ? 0 : -1 );
}
#endif /* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */
#if !defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
#define mbedtls_internal_sha256_process_many_c mbedtls_internal_sha256_process_many
#define mbedtls_internal_sha256_process_c mbedtls_internal_sha256_process
#endif
#if !defined(MBEDTLS_SHA256_PROCESS_ALT) && \
!defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
#define SHR(x,n) (((x) & 0xFFFFFFFF) >> (n))
#define ROTR(x,n) (SHR(x,n) | ((x) << (32 - (n))))
@ -169,8 +380,8 @@ static const uint32_t K[] =
(d) += local.temp1; (h) = local.temp1 + local.temp2; \
} while( 0 )
int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
const unsigned char data[64] )
int mbedtls_internal_sha256_process_c( mbedtls_sha256_context *ctx,
const unsigned char data[SHA256_BLOCK_SIZE] )
{
struct
{
@ -257,7 +468,69 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
return( 0 );
}
#endif /* !MBEDTLS_SHA256_PROCESS_ALT */
#endif /* !MBEDTLS_SHA256_PROCESS_ALT && !MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */
#if !defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
static size_t mbedtls_internal_sha256_process_many_c(
mbedtls_sha256_context *ctx, const uint8_t *data, size_t len )
{
size_t processed = 0;
while( len >= SHA256_BLOCK_SIZE )
{
if( mbedtls_internal_sha256_process_c( ctx, data ) != 0 )
return( 0 );
data += SHA256_BLOCK_SIZE;
len -= SHA256_BLOCK_SIZE;
processed += SHA256_BLOCK_SIZE;
}
return( processed );
}
#endif /* !MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY */
#if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
static int mbedtls_a64_crypto_sha256_has_support( void )
{
static int done = 0;
static int supported = 0;
if( !done )
{
supported = mbedtls_a64_crypto_sha256_check_support();
done = 1;
}
return( supported );
}
static size_t mbedtls_internal_sha256_process_many( mbedtls_sha256_context *ctx,
const uint8_t *msg, size_t len )
{
if( mbedtls_a64_crypto_sha256_has_support() )
return( mbedtls_internal_sha256_process_many_a64_crypto( ctx, msg, len ) );
else
return( mbedtls_internal_sha256_process_many_c( ctx, msg, len ) );
}
int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
const unsigned char data[SHA256_BLOCK_SIZE] )
{
if( mbedtls_a64_crypto_sha256_has_support() )
return( mbedtls_internal_sha256_process_a64_crypto( ctx, data ) );
else
return( mbedtls_internal_sha256_process_c( ctx, data ) );
}
#endif /* MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT */
/*
* SHA-256 process buffer
@ -277,7 +550,7 @@ int mbedtls_sha256_update( mbedtls_sha256_context *ctx,
return( 0 );
left = ctx->total[0] & 0x3F;
fill = 64 - left;
fill = SHA256_BLOCK_SIZE - left;
ctx->total[0] += (uint32_t) ilen;
ctx->total[0] &= 0xFFFFFFFF;
@ -297,13 +570,15 @@ int mbedtls_sha256_update( mbedtls_sha256_context *ctx,
left = 0;
}
while( ilen >= 64 )
while( ilen >= SHA256_BLOCK_SIZE )
{
if( ( ret = mbedtls_internal_sha256_process( ctx, input ) ) != 0 )
return( ret );
size_t processed =
mbedtls_internal_sha256_process_many( ctx, input, ilen );
if( processed < SHA256_BLOCK_SIZE )
return( MBEDTLS_ERR_ERROR_GENERIC_ERROR );
input += 64;
ilen -= 64;
input += processed;
ilen -= processed;
}
if( ilen > 0 )
@ -340,7 +615,7 @@ int mbedtls_sha256_finish( mbedtls_sha256_context *ctx,
else
{
/* We'll need an extra block */
memset( ctx->buffer + used, 0, 64 - used );
memset( ctx->buffer + used, 0, SHA256_BLOCK_SIZE - used );
if( ( ret = mbedtls_internal_sha256_process( ctx, ctx->buffer ) ) != 0 )
return( ret );

View File

@ -198,6 +198,7 @@ EXCLUDE_FROM_FULL = frozenset([
'MBEDTLS_PSA_CRYPTO_SPM', # platform dependency (PSA SPM)
'MBEDTLS_PSA_INJECT_ENTROPY', # build dependency (hook functions)
'MBEDTLS_RSA_NO_CRT', # influences the use of RSA in X.509 and TLS
'MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY', # interacts with *_USE_A64_CRYPTO_ONLY
'MBEDTLS_TEST_CONSTANT_FLOW_MEMSAN', # build dependency (clang+memsan)
'MBEDTLS_TEST_CONSTANT_FLOW_VALGRIND', # build dependency (valgrind headers)
'MBEDTLS_X509_REMOVE_INFO', # removes a feature

View File

@ -1498,6 +1498,9 @@ component_build_module_alt () {
# The SpecifiedECDomain parsing code accesses mbedtls_ecp_group fields
# directly and assumes the implementation works with partial groups.
scripts/config.py unset MBEDTLS_PK_PARSE_EC_EXTENDED
# MBEDTLS_SHA256_*ALT can't be used with MBEDTLS_SHA256_USE_A64_CRYPTO_*
scripts/config.py unset MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT
scripts/config.py unset MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY
# Enable all MBEDTLS_XXX_ALT for whole modules. Do not enable
# MBEDTLS_XXX_YYY_ALT which are for single functions.
scripts/config.py set-all 'MBEDTLS_([A-Z0-9]*|NIST_KW)_ALT'
@ -2702,7 +2705,7 @@ component_build_armcc () {
armc6_build_test "--target=arm-arm-none-eabi -march=armv8-m.main"
# ARM Compiler 6 - Target ARMv8-A - AArch64
armc6_build_test "--target=aarch64-arm-none-eabi -march=armv8.2-a"
armc6_build_test "--target=aarch64-arm-none-eabi -march=armv8.2-a+crypto"
}
component_test_tls13 () {