glibc/sysdeps/sparc/sparc32/sub_n.S

329 lines
7.5 KiB
ArmAsm

! SPARC __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
! store difference in a third limb vector.
!
! Copyright (C) 1995-2023 Free Software Foundation, Inc.
!
! This file is part of the GNU MP Library.
!
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
!
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
#define RES_PTR %o0
#define S1_PTR %o1
#define S2_PTR %o2
#define SIZE %o3
#include <sysdep.h>
/* __mpn_sub_n (RES_PTR, S1_PTR, S2_PTR, SIZE)

   Subtract the SIZE-limb vector at S2_PTR from the SIZE-limb vector at
   S1_PTR, store the SIZE-limb difference at RES_PTR and return the borrow
   out of the most significant limb (0 or 1) in %o0.  Limbs are 32-bit
   words, least significant limb at the lowest address.  SIZE must be > 0.

   Register usage (leaf routine, returns with retl, no register window):
     %o0-%o3      RES_PTR, S1_PTR, S2_PTR, SIZE; all four are modified.
     %g1-%g4      operand limbs.
     %o4, %o5     result limbs; %o4 also parks the borrow (see below).

   Strategy: ldd and std move two limbs at once but need an 8-byte aligned
   address, so the code looks at bit 2 of the three pointers and picks the
   variant in which two of the three streams can use doubleword accesses:
     V1a  S2_PTR and RES_PTR agree in bit 2: ldd from S2, std to RES.
     V1b  S1_PTR and RES_PTR agree in bit 2: ldd from S1, std to RES.
     V2   otherwise S1_PTR and S2_PTR agree:  ldd from both, st to RES.
   (Assumes every pointer is at least 4-byte aligned -- TODO confirm against
   callers.)

   Borrow handling: the borrow between limbs lives in the integer carry
   flag and is consumed/produced by subxcc.  Every instruction that updates
   or tests the limb counter (addcc, cmp, andcc) overwrites that flag, so
   before such an instruction the flag is copied into %o4 with
   "addx %g0,%g0,%o4" and afterwards it is re-created with
   "subcc %g0,%o4,%g0" (0 - %o4 borrows exactly when %o4 is 1).  The
   restore normally sits in the delay slot of the branch that tested the
   counter, so it executes whether or not the branch is taken.  */
ENTRY(__mpn_sub_n)
/* Does bit 2 of S2_PTR differ from bit 2 of RES_PTR?  */
xor S2_PTR,RES_PTR,%g1
andcc %g1,4,%g0
bne LOC(1) ! branch if alignment differs
nop
! ** V1a **
/* S2_PTR and RES_PTR have the same bit 2.  If that bit is set, peel off
   one limb so both become 8-byte aligned.  */
andcc RES_PTR,4,%g0 ! RES_PTR unaligned? Side effect: cy=0
be LOC(v1) ! if no, branch
nop
/* Subtract least significant limb separately to align RES_PTR and S2_PTR */
ld [S1_PTR],%g4
add S1_PTR,4,S1_PTR
ld [S2_PTR],%g2
add S2_PTR,4,S2_PTR
add SIZE,-1,SIZE
/* Plain subcc: there is no borrow into the first limb.  The add
   instructions that follow do not touch the flags.  */
subcc %g4,%g2,%o4
st %o4,[RES_PTR]
add RES_PTR,4,RES_PTR
LOC(v1):
/* Carry is 0 (from the andcc) or the borrow out of the peeled limb.  */
addx %g0,%g0,%o4 ! save cy in register
/* With fewer than two limbs left there is no pair to preload; end2
   finishes zero or one remaining limb with word accesses.  */
cmp SIZE,2 ! if SIZE < 2 ...
bl LOC(end2) ! ... branch to tail code
subcc %g0,%o4,%g0 ! restore cy
/* Preload the first limb pair: S1 limbs in %g4,%g1 and S2 limbs in
   %g2,%g3.  SIZE is biased by -10 because each pass of the 8-limb loop
   consumes 8 limbs and also preloads the following pair.  */
ld [S1_PTR+0],%g4
addcc SIZE,-10,SIZE
ld [S1_PTR+4],%g1
ldd [S2_PTR+0],%g2
blt LOC(fin1)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract blocks of 8 limbs until less than 8 limbs remain */
/* Software pipelined: the loads for the next pair are issued between the
   two subxcc of the current pair, and the pair is stored afterwards.
   Loads and stores leave the carry flag alone.  */
LOC(loop1):
subxcc %g4,%g2,%o4
ld [S1_PTR+8],%g4
subxcc %g1,%g3,%o5
ld [S1_PTR+12],%g1
ldd [S2_PTR+8],%g2
std %o4,[RES_PTR+0]
subxcc %g4,%g2,%o4
ld [S1_PTR+16],%g4
subxcc %g1,%g3,%o5
ld [S1_PTR+20],%g1
ldd [S2_PTR+16],%g2
std %o4,[RES_PTR+8]
subxcc %g4,%g2,%o4
ld [S1_PTR+24],%g4
subxcc %g1,%g3,%o5
ld [S1_PTR+28],%g1
ldd [S2_PTR+24],%g2
std %o4,[RES_PTR+16]
subxcc %g4,%g2,%o4
ld [S1_PTR+32],%g4
subxcc %g1,%g3,%o5
ld [S1_PTR+36],%g1
ldd [S2_PTR+32],%g2
std %o4,[RES_PTR+24]
addx %g0,%g0,%o4 ! save cy in register
addcc SIZE,-8,SIZE
add S1_PTR,32,S1_PTR
add S2_PTR,32,S2_PTR
add RES_PTR,32,RES_PTR
bge LOC(loop1)
subcc %g0,%o4,%g0 ! restore cy
LOC(fin1):
/* Re-bias: SIZE becomes (limbs still to do, counting the preloaded pair)
   minus 4, so a non-negative value means one more pair can be preloaded.  */
addcc SIZE,8-2,SIZE
blt LOC(end1)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract blocks of 2 limbs until less than 2 limbs remain */
LOC(loope1):
subxcc %g4,%g2,%o4
ld [S1_PTR+8],%g4
subxcc %g1,%g3,%o5
ld [S1_PTR+12],%g1
ldd [S2_PTR+8],%g2
std %o4,[RES_PTR+0]
addx %g0,%g0,%o4 ! save cy in register
addcc SIZE,-2,SIZE
add S1_PTR,8,S1_PTR
add S2_PTR,8,S2_PTR
add RES_PTR,8,RES_PTR
bge LOC(loope1)
subcc %g0,%o4,%g0 ! restore cy
LOC(end1):
/* Drain the pipeline: subtract and store the last preloaded pair.  */
subxcc %g4,%g2,%o4
subxcc %g1,%g3,%o5
std %o4,[RES_PTR+0]
addx %g0,%g0,%o4 ! save cy in register
/* SIZE is -2 or -1 here; bit 0 set means one odd limb is still left.  */
andcc SIZE,1,%g0
be LOC(ret1)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract last limb */
/* The pointers were not advanced past the final pair, hence offset 8.  */
ld [S1_PTR+8],%g4
ld [S2_PTR+8],%g2
subxcc %g4,%g2,%o4
st %o4,[RES_PTR+8]
LOC(ret1):
retl
addx %g0,%g0,%o0 ! return carry-out from most sign. limb
/* Bit 2 of S2_PTR and RES_PTR differ.  Does S1_PTR match RES_PTR?  */
LOC(1): xor S1_PTR,RES_PTR,%g1
andcc %g1,4,%g0
bne LOC(2)
nop
! ** V1b **
/* Mirror image of V1a with the roles of the two sources exchanged: S1 is
   read with ldd into %g2,%g3 and S2 with single loads into %g4,%g1.  The
   subtractions are written %g2-%g4 and %g3-%g1, so the result is still
   S1 - S2.  */
andcc RES_PTR,4,%g0 ! RES_PTR unaligned? Side effect: cy=0
be LOC(v1b) ! if no, branch
nop
/* Subtract least significant limb separately to align RES_PTR and S1_PTR */
ld [S2_PTR],%g4
add S2_PTR,4,S2_PTR
ld [S1_PTR],%g2
add S1_PTR,4,S1_PTR
add SIZE,-1,SIZE
subcc %g2,%g4,%o4
st %o4,[RES_PTR]
add RES_PTR,4,RES_PTR
LOC(v1b):
addx %g0,%g0,%o4 ! save cy in register
/* Zero or one limb left: use the shared word-access tail at end2.  */
cmp SIZE,2 ! if SIZE < 2 ...
bl LOC(end2) ! ... branch to tail code
subcc %g0,%o4,%g0 ! restore cy
/* Preload the first pair and bias SIZE by -10, as in V1a.  */
ld [S2_PTR+0],%g4
addcc SIZE,-10,SIZE
ld [S2_PTR+4],%g1
ldd [S1_PTR+0],%g2
blt LOC(fin1b)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract blocks of 8 limbs until less than 8 limbs remain */
LOC(loop1b):
subxcc %g2,%g4,%o4
ld [S2_PTR+8],%g4
subxcc %g3,%g1,%o5
ld [S2_PTR+12],%g1
ldd [S1_PTR+8],%g2
std %o4,[RES_PTR+0]
subxcc %g2,%g4,%o4
ld [S2_PTR+16],%g4
subxcc %g3,%g1,%o5
ld [S2_PTR+20],%g1
ldd [S1_PTR+16],%g2
std %o4,[RES_PTR+8]
subxcc %g2,%g4,%o4
ld [S2_PTR+24],%g4
subxcc %g3,%g1,%o5
ld [S2_PTR+28],%g1
ldd [S1_PTR+24],%g2
std %o4,[RES_PTR+16]
subxcc %g2,%g4,%o4
ld [S2_PTR+32],%g4
subxcc %g3,%g1,%o5
ld [S2_PTR+36],%g1
ldd [S1_PTR+32],%g2
std %o4,[RES_PTR+24]
addx %g0,%g0,%o4 ! save cy in register
addcc SIZE,-8,SIZE
add S1_PTR,32,S1_PTR
add S2_PTR,32,S2_PTR
add RES_PTR,32,RES_PTR
bge LOC(loop1b)
subcc %g0,%o4,%g0 ! restore cy
LOC(fin1b):
/* Re-bias SIZE to (limbs still to do, counting the preloaded pair) - 4.  */
addcc SIZE,8-2,SIZE
blt LOC(end1b)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract blocks of 2 limbs until less than 2 limbs remain */
LOC(loope1b):
subxcc %g2,%g4,%o4
ld [S2_PTR+8],%g4
subxcc %g3,%g1,%o5
ld [S2_PTR+12],%g1
ldd [S1_PTR+8],%g2
std %o4,[RES_PTR+0]
addx %g0,%g0,%o4 ! save cy in register
addcc SIZE,-2,SIZE
add S1_PTR,8,S1_PTR
add S2_PTR,8,S2_PTR
add RES_PTR,8,RES_PTR
bge LOC(loope1b)
subcc %g0,%o4,%g0 ! restore cy
LOC(end1b):
/* Subtract and store the last preloaded pair.  */
subxcc %g2,%g4,%o4
subxcc %g3,%g1,%o5
std %o4,[RES_PTR+0]
addx %g0,%g0,%o4 ! save cy in register
/* Bit 0 of SIZE set means one odd limb is still left (at offset 8).  */
andcc SIZE,1,%g0
be LOC(ret1b)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract last limb */
ld [S2_PTR+8],%g4
ld [S1_PTR+8],%g2
subxcc %g2,%g4,%o4
st %o4,[RES_PTR+8]
LOC(ret1b):
retl
addx %g0,%g0,%o0 ! return carry-out from most sign. limb
! ** V2 **
/* If we come here, the alignment of S1_PTR and RES_PTR as well as the
alignment of S2_PTR and RES_PTR differ. Since there are only two ways
things can be aligned (that we care about) we now know that the alignment
of S1_PTR and S2_PTR are the same. */
/* A one-limb operation goes straight to the single-limb tail.  When SIZE
   equals 1 the cmp leaves the carry flag clear, so the subxcc at jone sees
   no borrow-in.  */
LOC(2): cmp SIZE,1
be LOC(jone)
nop
andcc S1_PTR,4,%g0 ! S1_PTR unaligned? Side effect: cy=0
be LOC(v2) ! if no, branch
nop
/* Subtract least significant limb separately to align S1_PTR and S2_PTR */
ld [S1_PTR],%g4
add S1_PTR,4,S1_PTR
ld [S2_PTR],%g2
add S2_PTR,4,S2_PTR
add SIZE,-1,SIZE
subcc %g4,%g2,%o4
st %o4,[RES_PTR]
add RES_PTR,4,RES_PTR
LOC(v2):
addx %g0,%g0,%o4 ! save cy in register
/* No preloading in this variant, so the bias is just the block size.  */
addcc SIZE,-8,SIZE
blt LOC(fin2)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract blocks of 8 limbs until less than 8 limbs remain */
/* Both sources are read with ldd (S1 into %g2,%g3 and S2 into %o4,%o5).
   RES_PTR has the other alignment, so each result limb is written with a
   separate st.  %o4 is free for data here because the borrow is only
   parked in it at the bottom of the loop.  */
LOC(loop2):
ldd [S1_PTR+0],%g2
ldd [S2_PTR+0],%o4
subxcc %g2,%o4,%g2
st %g2,[RES_PTR+0]
subxcc %g3,%o5,%g3
st %g3,[RES_PTR+4]
ldd [S1_PTR+8],%g2
ldd [S2_PTR+8],%o4
subxcc %g2,%o4,%g2
st %g2,[RES_PTR+8]
subxcc %g3,%o5,%g3
st %g3,[RES_PTR+12]
ldd [S1_PTR+16],%g2
ldd [S2_PTR+16],%o4
subxcc %g2,%o4,%g2
st %g2,[RES_PTR+16]
subxcc %g3,%o5,%g3
st %g3,[RES_PTR+20]
ldd [S1_PTR+24],%g2
ldd [S2_PTR+24],%o4
subxcc %g2,%o4,%g2
st %g2,[RES_PTR+24]
subxcc %g3,%o5,%g3
st %g3,[RES_PTR+28]
addx %g0,%g0,%o4 ! save cy in register
addcc SIZE,-8,SIZE
add S1_PTR,32,S1_PTR
add S2_PTR,32,S2_PTR
add RES_PTR,32,RES_PTR
bge LOC(loop2)
subcc %g0,%o4,%g0 ! restore cy
LOC(fin2):
/* Re-bias: SIZE becomes (limbs still to do) - 2.  */
addcc SIZE,8-2,SIZE
blt LOC(end2)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract blocks of 2 limbs until less than 2 limbs remain */
LOC(loope2):
ldd [S1_PTR+0],%g2
ldd [S2_PTR+0],%o4
subxcc %g2,%o4,%g2
st %g2,[RES_PTR+0]
subxcc %g3,%o5,%g3
st %g3,[RES_PTR+4]
addx %g0,%g0,%o4 ! save cy in register
addcc SIZE,-2,SIZE
add S1_PTR,8,S1_PTR
add S2_PTR,8,S2_PTR
add RES_PTR,8,RES_PTR
bge LOC(loope2)
subcc %g0,%o4,%g0 ! restore cy
/* Shared tail, also reached from V1a and V1b when fewer than two limbs
   are left.  On entry %o4 holds the saved borrow and the pointers address
   the next unprocessed limb.  SIZE is -2 or -1 when coming from this
   variant and 0 or 1 when coming from V1a/V1b; in both cases bit 0 set
   means exactly one limb remains.  */
LOC(end2):
andcc SIZE,1,%g0
be LOC(ret2)
subcc %g0,%o4,%g0 ! restore cy
/* Subtract last limb */
LOC(jone):
ld [S1_PTR],%g4
ld [S2_PTR],%g2
subxcc %g4,%g2,%o4
st %o4,[RES_PTR]
LOC(ret2):
retl
addx %g0,%g0,%o0 ! return carry-out from most sign. limb
END(__mpn_sub_n)