2022-06-29 23:07:06 +00:00
|
|
|
#include <isa-level.h>
|
|
|
|
|
|
|
|
#if ISA_SHOULD_BUILD (4)
|
|
|
|
|
2021-09-20 21:20:15 +00:00
|
|
|
/* Flag, defined to 1, marking this build as the EVEX variant.  It is set
   before the two files included later in this unit; presumably one of
   them tests it -- TODO confirm which.  */
# define USE_WITH_EVEX 1
|
|
|
|
|
2022-10-15 03:00:28 +00:00
|
|
|
# include "x86-evex256-vecs.h"
|
2021-03-05 15:15:03 +00:00
|
|
|
|
x86: Improve vec generation in memset-vec-unaligned-erms.S
No bug.
Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
Results for memset-avx2 small (geomean of N = 20 benchset runs).
size, New Time, Old Time, New / Old
0, 4.100, 3.831, 0.934
1, 5.074, 4.399, 0.867
2, 4.433, 4.411, 0.995
4, 4.487, 4.415, 0.984
8, 4.454, 4.396, 0.987
16, 4.502, 4.443, 0.987
All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2022-02-06 06:54:18 +00:00
|
|
|
/* Load the memset fill pattern into vector register 0 and set the
   return value.

   D  operand holding the byte to replicate.
   R  operand holding the value to return.

   vpbroadcastb copies the low byte of D into every byte lane of
   %VMM(0); movq then copies R into %rax.  VMM is not defined in this
   file (presumably it comes from x86-evex256-vecs.h -- confirm there).

   The three lines below must stay contiguous: each trailing backslash
   splices the very next physical line into the macro, so nothing may
   be placed between them.  */
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
  vpbroadcastb d, %VMM(0); \
  movq r, %rax
|
2021-03-05 15:15:03 +00:00
|
|
|
|
x86: Improve vec generation in memset-vec-unaligned-erms.S
No bug.
Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
Results for memset-avx2 small (geomean of N = 20 benchset runs).
size, New Time, Old Time, New / Old
0, 4.100, 3.831, 0.934
1, 5.074, 4.399, 0.867
2, 4.433, 4.411, 0.995
4, 4.487, 4.415, 0.984
8, 4.454, 4.396, 0.987
16, 4.502, 4.443, 0.987
All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2022-02-06 06:54:18 +00:00
|
|
|
/* Load the wmemset fill pattern into vector register 0 and set the
   return value.

   D  operand holding the 32-bit element to replicate.
   R  operand holding the value to return.

   vpbroadcastd copies the low doubleword of D into every doubleword
   lane of %VMM(0); movq then copies R into %rax.  VMM is not defined in
   this file (presumably it comes from x86-evex256-vecs.h -- confirm
   there).

   The three lines below must stay contiguous: each trailing backslash
   splices the very next physical line into the macro, so nothing may
   be placed between them.  */
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
  vpbroadcastd d, %VMM(0); \
  movq r, %rax
|
|
|
|
|
|
|
|
/* Expands to nothing.  In this file the byte broadcast into %VMM(0) is
   issued by MEMSET_SET_VEC0_AND_SET_RETURN, so this step presumably has
   no work left to do -- confirm against the code that invokes it.  */
# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
|
/* Expands to nothing; the memset "low" broadcast step emits no
   instructions in this configuration.  */
# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
|
|
|
|
|
|
/* Expands to nothing.  In this file the doubleword broadcast into
   %VMM(0) is issued by WMEMSET_SET_VEC0_AND_SET_RETURN, so this step
   presumably has no work left to do -- confirm against the code that
   invokes it.  */
# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
|
/* Expands to nothing; the wmemset "low" broadcast step emits no
   instructions in this configuration.  */
# define WMEMSET_VDUP_TO_VEC0_LOW()
|
2021-03-05 15:15:03 +00:00
|
|
|
|
2022-06-29 23:07:06 +00:00
|
|
|
#ifndef MEMSET_SYMBOL
|
2021-03-05 15:15:03 +00:00
|
|
|
/* Build a symbol name by pasting P, "_evex_" and S together, e.g.
   MEMSET_SYMBOL (__memset, unaligned) -> __memset_evex_unaligned.
   The enclosing #ifndef means this default applies only when
   MEMSET_SYMBOL has not already been defined.  */
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
2022-06-29 23:07:06 +00:00
|
|
|
#endif
|
|
|
|
#ifndef WMEMSET_SYMBOL
|
2021-03-05 15:15:03 +00:00
|
|
|
/* Build a symbol name by pasting P, "_evex_" and S together, e.g.
   WMEMSET_SYMBOL (__wmemset, unaligned) -> __wmemset_evex_unaligned.
   The enclosing #ifndef means this default applies only when
   WMEMSET_SYMBOL has not already been defined.  */
# define WMEMSET_SYMBOL(p,s) p##_evex_##s
|
2022-06-29 23:07:06 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2021-04-19 21:48:10 +00:00
|
|
|
/* Flag, defined to 1, set before memset-vec-unaligned-erms.S is
   included.  Judging by the name it selects masked stores for lengths
   below one vector -- TODO confirm in memset-vec-unaligned-erms.S.  */
# define USE_LESS_VEC_MASK_STORE 1
|
2021-03-05 15:15:03 +00:00
|
|
|
# include "memset-vec-unaligned-erms.S"
|
|
|
|
#endif
|