Fixed several libmvec bugs found during testing on KNL hardware.

Fixed the AVX512 IFUNC implementations, the AVX512 wrappers to the AVX2 versions, and the KNL expf implementation.

	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
	implementation.
2024-11-10 15:20:10 +00:00 · 2015-07-24 14:47:23 +03:00 · 2015-07-24 14:47:23 +03:00 · 9901716135
commit 9901716135
parent 3bcea719dd
16 changed files with 220 additions and 223 deletions
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-07-24  Andrew Senkevich  <andrew.senkevich@intel.com>
+
+	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
+	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
+	implementation.
+
 2015-07-24  Szabolcs Nagy  <szabolcs.nagy@arm.com>

 	[BZ #17711]
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_cos)
        .type   _ZGVeN8v_cos, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)

 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_exp)
        .type   _ZGVeN8v_exp, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)

 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_log)
        .type   _ZGVeN8v_log, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)

 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vv_pow)
        .type   _ZGVeN8vv_pow, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)

 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_sin)
        .type   _ZGVeN8v_sin, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)

 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vvv_sincos)
        .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)

 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_cosf)
        .type   _ZGVeN16v_cosf, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)

 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_expf)
        .type   _ZGVeN16v_expf, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)

 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
     The table lookup is skipped if k = 0.
     For low accuracy approximation, exp(r) ~ 1 or 1+r.  */

+        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_logf)
        .type   _ZGVeN16v_logf, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)

 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vv_powf)
        .type   _ZGVeN16vv_powf, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)

 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vvv_sincosf)
        .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)

 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_sinf)
        .type   _ZGVeN16v_sinf, @gnu_indirect_function
        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
        call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
        leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)

 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq	%rbp
+        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
-        movq	%rsp, %rbp
+        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
-        andq	$-64, %rsp
-        subq	$64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x04
-        .byte	0x24
-        call	HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
-        call	HIDDEN_JUMPTARGET(\callee)
-        movq	%rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
-        popq	%rbp
+        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
@@ -234,61 +234,50 @@

 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq	%rbp
+        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
-        movq	%rsp, %rbp
+        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
-        andq	$-64, %rsp
-        subq	$128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x4c
-        .byte	0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x4c
-        .byte	0x24
-        .byte	0x40
-        call	HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x4c
-        .byte	0x24
-        .byte	0x60
-        call	HIDDEN_JUMPTARGET(\callee)
-        movq	%rbp, %rsp
+        andq      $-64, %rsp
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovupd   (%rsp), %ymm0
+        vmovupd   64(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 128(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
+        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
-        popq	%rbp
+        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
@@ -310,61 +299,26 @@
        cfi_rel_offset (%r13, 0)
        subq      $176, %rsp
        movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
        .byte	0x62
        .byte	0xf1
        .byte	0x7c
        .byte	0x48
-        .byte	0x29
+        .byte	0x11
        .byte	0x04
        .byte	0x24
        movq    %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x04
-        .byte	0x24
+        vmovupd (%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
+        vmovupd   32(%rsp), %ymm0
        lea       64(%rsp), %rdi
        lea       96(%rsp), %rsi
        call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x40
-/* Below is encoding for vmovapd   96(%rsp), %ymm1.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x4c
-        .byte	0x24
-        .byte	0x60
-/* Below is encoding for vmovapd   %ymm0, 32(%r12).  */
-        .byte	0xc4
-        .byte	0xc1
-        .byte	0x7d
-        .byte	0x29
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
-/* Below is encoding for vmovapd   %ymm1, 32(%r13).  */
-        .byte	0xc4
-        .byte	0xc1
-        .byte	0x7d
-        .byte	0x29
-        .byte	0x4d
-        .byte	0x20
+        vmovupd   64(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        vmovupd   %ymm0, 32(%r12)
+        vmovupd   %ymm1, 32(%r13)
+        vzeroupper
        addq      $176, %rsp
        popq      %r13
        cfi_adjust_cfa_offset (-8)
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq	%rbp
+        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
-        movq	%rsp, %rbp
+        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
-        andq	$-64, %rsp
-        subq	$64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-        vmovaps (%rsp), %ymm0
-        call	HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        call	HIDDEN_JUMPTARGET(\callee)
-        movq	%rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
-        popq	%rbp
+        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
@@ -274,29 +285,41 @@
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x4c
-        .byte	0x24
-        vmovaps (%rsp), %ymm0
-        vmovaps 64(%rsp), %ymm1
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovups   (%rsp), %ymm0
+        vmovups   64(%rsp), %ymm1
        call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        vmovaps 96(%rsp), %ymm1
+        vmovups   %ymm0, 128(%rsp)
+        vmovups   32(%rsp), %ymm0
+        vmovups   96(%rsp), %ymm1
        call      HIDDEN_JUMPTARGET(\callee)
+        vmovups   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp