S390: Optimize __memset_z196.

It turned out that a 256-byte mvc instruction that depends on the
result of a previous 256-byte mvc instruction is counterproductive.
Therefore, this patch adjusts the 256-byte loop to store the
first byte with stc and set the remaining 255 bytes with mvc.
Now the 255-byte mvc instruction depends on the stc instruction
instead of on the previous mvc instruction.
This commit is contained in:
Stefan Liebler 2020-06-26 09:45:11 +02:00
parent 0792c8ae1a
commit 1d21fb1061

View File

@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)
# if !defined __s390x__
llgfr %r4,%r4
# endif /* !defined __s390x__ */
ltgr %r4,%r4
je .L_Z196_4
clgfi %r4,1
jl .L_Z196_4 # n == 0
stc %r3,0(%r2)
je .L_Z196_4 # n == 1
aghi %r4,-2
lgr %r1,%r2
cghi %r4,1
je .L_Z196_4
aghi %r4,-2
srlg %r5,%r4,8
ltgr %r5,%r5
jne .L_Z196_1
risbg %r5,%r4,8,128+63,56 # r5 = n / 256
jne .L_Z196_1 # Jump away if r5 != 0
.L_Z196_3:
exrl %r4,.L_Z196_17
.L_Z196_4:
br %r14
.L_Z196_1:
cgfi %r5,1048576
jh __memset_mvcle # Switch to mvcle for >256MB
jh __memset_mvcle # Switch to mvcle for >256MB
.L_Z196_2:
pfd 2,1024(%r1)
mvc 1(256,%r1),0(%r1)
mvc 1(255,%r1),0(%r1)
aghi %r5,-1
la %r1,256(%r1)
stc %r3,0(%r1)
jne .L_Z196_2
j .L_Z196_3
.L_Z196_17: