aria: optimise byte perms on Intel
(A similar commit for Arm follows.)

Use specific instructions for moving bytes around in a word. This speeds things up and, as a side effect, slightly lowers code size.

ARIA_P3 (aka reverse byte order) is now 1 instruction on x86, which speeds up the key schedule. (Clang 3.8 finds this but GCC 5.4 doesn't.)

I couldn't find an Intel equivalent of Arm's rev16 (aka ARIA_P1), so I made it two instructions, which is still much better than the code generated with the previous mask-shift-or definition, and speeds up en/decryption. (Neither Clang 3.8 nor GCC 5.4 finds this.)

Before:
O   aria.o   ins
s     7976   43,865
2    10520   37,631
3    13040   28,146

After:
O   aria.o   ins
s     7768   33,497
2     9816   28,268
3    11432   20,829

For measurement method, see previous commit: "aria: turn macro into static inline function"
This commit is contained in:
parent
cac5008b17
commit
fb0e4f0d1a
@ -81,13 +81,26 @@ static void mbedtls_zeroize( void *v, size_t n ) {
|
||||
* modify byte order: ( A B C D ) -> ( B A D C ), i.e. swap pairs of bytes
|
||||
*
|
||||
* This is submatrix P1 in [1] Appendix B.1
|
||||
*
|
||||
* Common compilers fail to translate this to the minimal number of instructions,
|
||||
* so let's provide asm versions for common platforms with C fallback.
|
||||
*/
|
||||
#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__)
#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
/*
 * I couldn't find an Intel equivalent of Arm's rev16, so this takes two
 * steps: reverse all four bytes (P3), then rotate by 16 bits (P2):
 * ( A B C D ) -> ( D C B A ) -> ( B A D C )
 */
#define ARIA_P1(x) ARIA_P2( ARIA_P3( x ) )
#endif
#endif /* MBEDTLS_HAVE_ASM && GNUC */
#if !defined(ARIA_P1)
/*
 * Portable C fallback: swap the two bytes inside each 16-bit half of x.
 *
 * Implemented as a function rather than a function-like macro so that the
 * argument is evaluated exactly once and is always handled as an unsigned
 * 32-bit value. ARIA_P1 stays usable as ARIA_P1( expr ).
 *
 * x:       32-bit word ( A B C D )
 * returns: ( B A D C )
 */
static inline uint32_t aria_p1( uint32_t x )
{
    /* Odd-position bytes move down, even-position bytes move up; the two
     * masked halves never overlap, so XOR simply merges them. */
    return( ( ( x >> 8 ) & 0x00FF00FF ) ^ ( ( x & 0x00FF00FF ) << 8 ) );
}
#define ARIA_P1 aria_p1
#endif
|
||||
|
||||
/*
|
||||
* modify byte order: ( A B C D ) -> ( C D A B ), i.e. rotate by 16 bits
|
||||
*
|
||||
* This is submatrix P2 in [1] Appendix B.1
|
||||
*
|
||||
* Common compilers will translate this to a single instruction.
|
||||
*/
|
||||
/*
 * Implemented as a function rather than a function-like macro so that the
 * argument is evaluated exactly once (callers pass the result of another
 * permutation in) and is always handled as an unsigned 32-bit value.
 * ARIA_P2 stays usable as ARIA_P2( expr ).
 *
 * x:       32-bit word ( A B C D )
 * returns: ( C D A B )
 */
static inline uint32_t aria_p2( uint32_t x )
{
    /* The two shifted halves occupy disjoint bits, so XOR merges them. */
    return( ( x >> 16 ) ^ ( x << 16 ) );
}
#define ARIA_P2 aria_p2
|
||||
|
||||
@ -95,8 +108,23 @@ static void mbedtls_zeroize( void *v, size_t n ) {
|
||||
* modify byte order: ( A B C D ) -> ( D C B A ), i.e. change endianness
|
||||
*
|
||||
* This is submatrix P3 in [1] Appendix B.1
|
||||
*
|
||||
* Some compilers fail to translate this to a single instruction,
|
||||
* so let's provide asm versions for common platforms with C fallback.
|
||||
*/
|
||||
#if defined(MBEDTLS_HAVE_ASM) && defined(__GNUC__)
#if defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
/*
 * Reverse the byte order of x with the x86 bswap instruction.
 *
 * Spelled __asm__ rather than asm: plain asm is a GNU extension keyword
 * that is not recognised when compiling in a strict ISO mode (-ansi,
 * -std=c99, -std=c11), whereas __asm__ is always accepted by GNU-compatible
 * compilers.
 *
 * The "0" input constraint ties the input to the output operand, so bswap
 * works in place on a single register.
 *
 * x:       32-bit word ( A B C D )
 * returns: ( D C B A )
 */
static inline uint32_t aria_p3( uint32_t x )
{
    __asm__( "bswap %0" : "=r" (x) : "0" (x) );
    return( x );
}
#define ARIA_P3 aria_p3
#endif
#endif /* MBEDTLS_HAVE_ASM && GNUC */
#if !defined(ARIA_P3)
/* C fallback: swap the bytes within each half (P1), then swap the halves (P2):
 * ( A B C D ) -> ( B A D C ) -> ( D C B A ) */
#define ARIA_P3(x) ARIA_P2( ARIA_P1 ( x ) )
#endif
|
||||
|
||||
/*
|
||||
* ARIA Affine Transform
|
||||
|
Loading…
Reference in New Issue
Block a user