Remove 32 bit sparc v7 support

The patch is straightforward:

  - The sparc32 v8 implementations are moved to become the generic ones.

  - A configure test is added to check for either __sparc_v8__ or
    __sparc_v9__.

  - The triple names are simplified and sparc implies sparcv8.

The idea is to keep support for sparcv8 architectures that do support
CAS instructions, such as LEON3/LEON4.

Checked on sparcv9-linux-gnu and sparc64-linux-gnu.

Tested-by: Andreas Larsson <andreas@gaisler.com>
This commit is contained in:
Adhemerval Zanella 2019-11-13 12:32:17 +00:00
parent bfdb731438
commit 5d9b7b9fa7
26 changed files with 418 additions and 2708 deletions

4
NEWS
View File

@ -80,6 +80,10 @@ Deprecated and removed features, and other changes affecting compatibility:
* The obsolete function ftime has been deprecated and will be removed from
a future version of glibc. Applications should use clock_gettime instead.
* The sparc*-*linux-gnu configurations targeting v7 or older architectures
are no longer supported. For v8, only implementations with a native CAS
instruction are still supported (such as LEON).
Changes to build and runtime requirements:
[Add changes to build and runtime requirements here]

View File

@ -358,8 +358,10 @@ class Context(object):
self.add_config(arch='sparc64',
os_name='linux-gnu',
glibcs=[{},
{'arch': 'sparcv8',
'ccopts': '-m32 -mlong-double-128 -mcpu=leon3'}],
{'arch': 'sparcv9',
'ccopts': '-m32 -mlong-double-128'}],
'ccopts': '-m32 -mlong-double-128 -mcpu=v9'}],
extra_glibcs=[{'variant': 'disable-multi-arch',
'cfg': ['--disable-multi-arch']},
{'variant': 'disable-multi-arch',
@ -847,11 +849,7 @@ class Context(object):
# be touched because nothing in a build depends on the files
# in question.
for f in ('sysdeps/gnu/errlist.c',
'sysdeps/mach/hurd/bits/errno.h',
'sysdeps/sparc/sparc32/rem.S',
'sysdeps/sparc/sparc32/sdiv.S',
'sysdeps/sparc/sparc32/udiv.S',
'sysdeps/sparc/sparc32/urem.S'):
'sysdeps/mach/hurd/bits/errno.h'):
to_touch = os.path.join(srcdir, f)
subprocess.run(['touch', '-c', to_touch], check=True)
for dirpath, dirnames, filenames in os.walk(srcdir):

View File

@ -1,24 +1,10 @@
# preconfigure fragment for sparc.
case "$machine" in
sparc | sparcv[67])
sparc | sparcv8 | supersparc | hypersparc)
base_machine=sparc machine=sparc/sparc32 ;;
sparcv8 | supersparc | hypersparc)
base_machine=sparc machine=sparc/sparc32/sparcv8 ;;
sparcv8plus | sparcv8plusa | sparcv9)
sparcv8plus* | sparcv9*)
base_machine=sparc machine=sparc/sparc32/sparcv9 ;;
sparcv8plusb | sparcv9b)
base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9b ;;
sparcv9v)
base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9v ;;
sparcv9v2)
base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9v2 ;;
sparc64)
sparc64*)
base_machine=sparc machine=sparc/sparc64 ;;
sparc64b)
base_machine=sparc machine=sparc/sparc64/sparcv9b ;;
sparc64v)
base_machine=sparc machine=sparc/sparc64/sparcv9v ;;
sparc64v2)
base_machine=sparc machine=sparc/sparc64/sparcv9v2 ;;
esac

View File

@ -19,35 +19,8 @@ ifeq ($(subdir),gnulib)
sysdep_routines = dotmul umul $(divrem) alloca
endif # gnulib
# We distribute these files, even though they are generated,
# so as to avoid the need for a functioning m4 to build the library.
divrem := sdiv udiv rem urem
+divrem-NAME-sdiv := div
+divrem-NAME-udiv := udiv
+divrem-NAME-rem := rem
+divrem-NAME-urem := urem
+divrem-NAME = $(+divrem-NAME-$(basename $(notdir $@)))
+divrem-OP-div := div
+divrem-OP-udiv := div
+divrem-OP-rem := rem
+divrem-OP-urem := rem
+divrem-S-div := true
+divrem-S-rem := true
+divrem-S-udiv := false
+divrem-S-urem := false
$(divrem:%=$(sysdep_dir)/sparc/sparc32/%.S): $(sysdep_dir)/sparc/sparc32/divrem.m4
(echo -n "define(NAME,\`.$(+divrem-NAME)')"; \
echo -n " define(OP,\`$(+divrem-OP-$(+divrem-NAME))')"; \
echo -n " define(S,\`$(+divrem-S-$(+divrem-NAME))')"; \
echo " /* This file is generated from divrem.m4; DO NOT EDIT! */"; \
cat $<) | $(M4) > $@-tmp
# Make it unwritable so noone will edit it by mistake.
-chmod a-w $@-tmp
mv -f $@-tmp $@
sysdep-realclean := $(sysdep-realclean) $(divrem:%=sysdeps/sparc/sparc32/%.S)
# libgcc __divdi3 and __moddi3 uses .udiv and since it is also exported by
# libc.so linker will create PLTs for the symbol. To avoid it we strong alias
# the exported libc one to __wrap_.udiv and use linker option --wrap to make any

View File

@ -1,146 +1,118 @@
! SPARC __mpn_addmul_1 -- Multiply a limb vector with a limb and add
! the result to a second limb vector.
!
! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and
! add the result to a second limb vector.
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
!
! This file is part of the GNU MP Library.
!
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
!
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
! RES_PTR o0
! S1_PTR o1
! SIZE o2
! S2_LIMB o3
! res_ptr o0
! s1_ptr o1
! size o2
! s2_limb o3
#include <sysdep.h>
ENTRY(__mpn_addmul_1)
! Make S1_PTR and RES_PTR point at the end of their blocks
! and put (- 4 x SIZE) in index/loop counter.
sll %o2,2,%o2
add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval
add %o1,%o2,%o1
sub %g0,%o2,%o2
ld [%o1+0],%o4 ! 1
sll %o2,4,%g1
orcc %g0,%g0,%g2
mov %o7,%g4 ! Save return address register
and %g1,(4-1)<<4,%g1
1: call 2f
add %o7,3f-1b,%g3
2: jmp %g3+%g1
mov %g4,%o7 ! Restore return address register
cmp %o3,0xfff
bgu LOC(large)
.align 4
3:
LOC(00):
add %o0,-4,%o0
b LOC(loop00) /* 4, 8, 12, ... */
add %o1,-4,%o1
nop
LOC(01):
b LOC(loop01) /* 1, 5, 9, ... */
nop
nop
nop
LOC(10):
add %o0,-12,%o0 /* 2, 6, 10, ... */
b LOC(loop10)
add %o1,4,%o1
nop
LOC(11):
add %o0,-8,%o0 /* 3, 7, 11, ... */
b LOC(loop11)
add %o1,-8,%o1
nop
ld [%o1+%o2],%o5
mov 0,%o0
b LOC(0)
add %o4,-4,%o4
LOC(loop0):
addcc %o5,%g1,%g1
ld [%o1+%o2],%o5
addx %o0,%g0,%o0
st %g1,[%o4+%o2]
LOC(0): wr %g0,%o3,%y
sra %o5,31,%g2
and %o3,%g2,%g2
andcc %g1,0,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,0,%g1
sra %g1,20,%g4
sll %g1,12,%g1
rd %y,%g3
srl %g3,20,%g3
or %g1,%g3,%g1
addcc %g1,%o0,%g1
addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb
addcc %o2,4,%o2 ! loop counter
bne LOC(loop0)
ld [%o4+%o2],%o5
addcc %o5,%g1,%g1
addx %o0,%g0,%o0
retl
st %g1,[%o4+%o2]
LOC(large):
ld [%o1+%o2],%o5
mov 0,%o0
sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0
b LOC(1)
add %o4,-4,%o4
LOC(loop):
addcc %o5,%g3,%g3
ld [%o1+%o2],%o5
addx %o0,%g0,%o0
st %g3,[%o4+%o2]
LOC(1): wr %g0,%o5,%y
and %o5,%g4,%g2
andcc %g0,%g0,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%g0,%g1
rd %y,%g3
addcc %g3,%o0,%g3
addx %g2,%g1,%o0
addcc %o2,4,%o2
bne LOC(loop)
ld [%o4+%o2],%o5
addcc %g3,%g2,%g3 ! 1
ld [%o1+4],%o4 ! 2
rd %y,%g2 ! 1
addx %g0,%g2,%g2
ld [%o0+0],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+0] ! 1
LOC(loop00):
umul %o4,%o3,%g3 ! 2
ld [%o0+4],%g1 ! 2
addxcc %g3,%g2,%g3 ! 2
ld [%o1+8],%o4 ! 3
rd %y,%g2 ! 2
addx %g0,%g2,%g2
nop
addcc %g1,%g3,%g3
st %g3,[%o0+4] ! 2
LOC(loop11):
umul %o4,%o3,%g3 ! 3
addxcc %g3,%g2,%g3 ! 3
ld [%o1+12],%o4 ! 4
rd %y,%g2 ! 3
add %o1,16,%o1
addx %g0,%g2,%g2
ld [%o0+8],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+8] ! 3
LOC(loop10):
umul %o4,%o3,%g3 ! 4
addxcc %g3,%g2,%g3 ! 4
ld [%o1+0],%o4 ! 1
rd %y,%g2 ! 4
addx %g0,%g2,%g2
ld [%o0+12],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+12] ! 4
add %o0,16,%o0
addx %g0,%g2,%g2
LOC(loop01):
addcc %o2,-4,%o2
bg LOC(loop)
umul %o4,%o3,%g3 ! 1
addcc %o5,%g3,%g3
addx %o0,%g0,%o0
addcc %g3,%g2,%g3 ! 4
rd %y,%g2 ! 4
addx %g0,%g2,%g2
ld [%o0+0],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+0] ! 4
retl
st %g3,[%o4+%o2]
addx %g0,%g2,%o0
END(__mpn_addmul_1)

162
sysdeps/sparc/sparc32/configure vendored Normal file
View File

@ -0,0 +1,162 @@
# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
# Local configure fragment for sysdeps/sparc/sparc32
# Test if compiler targets at least sparcv8.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
if ${ac_cv_path_GREP+:} false; then :
$as_echo_n "(cached) " >&6
else
if test -z "$GREP"; then
ac_path_GREP_found=false
# Loop through the user's path and test for each of PROGNAME-LIST
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_prog in grep ggrep; do
for ac_exec_ext in '' $ac_executable_extensions; do
ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
as_fn_executable_p "$ac_path_GREP" || continue
# Check for GNU ac_path_GREP and select it if it is found.
# Check for GNU $ac_path_GREP
case `"$ac_path_GREP" --version 2>&1` in
*GNU*)
ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
*)
ac_count=0
$as_echo_n 0123456789 >"conftest.in"
while :
do
cat "conftest.in" "conftest.in" >"conftest.tmp"
mv "conftest.tmp" "conftest.in"
cp "conftest.in" "conftest.nl"
$as_echo 'GREP' >> "conftest.nl"
"$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
as_fn_arith $ac_count + 1 && ac_count=$as_val
if test $ac_count -gt ${ac_path_GREP_max-0}; then
# Best one so far, save it but keep looking for a better one
ac_cv_path_GREP="$ac_path_GREP"
ac_path_GREP_max=$ac_count
fi
# 10*(2^10) chars as input seems more than enough
test $ac_count -gt 10 && break
done
rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
esac
$ac_path_GREP_found && break 3
done
done
done
IFS=$as_save_IFS
if test -z "$ac_cv_path_GREP"; then
as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
fi
else
ac_cv_path_GREP=$GREP
fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
$as_echo "$ac_cv_path_GREP" >&6; }
GREP="$ac_cv_path_GREP"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
$as_echo_n "checking for egrep... " >&6; }
if ${ac_cv_path_EGREP+:} false; then :
$as_echo_n "(cached) " >&6
else
if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
then ac_cv_path_EGREP="$GREP -E"
else
if test -z "$EGREP"; then
ac_path_EGREP_found=false
# Loop through the user's path and test for each of PROGNAME-LIST
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_prog in egrep; do
for ac_exec_ext in '' $ac_executable_extensions; do
ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
as_fn_executable_p "$ac_path_EGREP" || continue
# Check for GNU ac_path_EGREP and select it if it is found.
# Check for GNU $ac_path_EGREP
case `"$ac_path_EGREP" --version 2>&1` in
*GNU*)
ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
*)
ac_count=0
$as_echo_n 0123456789 >"conftest.in"
while :
do
cat "conftest.in" "conftest.in" >"conftest.tmp"
mv "conftest.tmp" "conftest.in"
cp "conftest.in" "conftest.nl"
$as_echo 'EGREP' >> "conftest.nl"
"$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
as_fn_arith $ac_count + 1 && ac_count=$as_val
if test $ac_count -gt ${ac_path_EGREP_max-0}; then
# Best one so far, save it but keep looking for a better one
ac_cv_path_EGREP="$ac_path_EGREP"
ac_path_EGREP_max=$ac_count
fi
# 10*(2^10) chars as input seems more than enough
test $ac_count -gt 10 && break
done
rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
esac
$ac_path_EGREP_found && break 3
done
done
done
IFS=$as_save_IFS
if test -z "$ac_cv_path_EGREP"; then
as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
fi
else
ac_cv_path_EGREP=$EGREP
fi
fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
$as_echo "$ac_cv_path_EGREP" >&6; }
EGREP="$ac_cv_path_EGREP"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for at least sparcv8 support" >&5
$as_echo_n "checking for at least sparcv8 support... " >&6; }
if ${libc_cv_sparcv8+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#if defined (__sparc_v8__) || defined (__sparc_v9__)
yes
#endif
_ACEOF
if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
$EGREP "yes" >/dev/null 2>&1; then :
libc_cv_sparcv8=yes
else
libc_cv_sparcv8=no
fi
rm -f conftest*
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_sparcv8" >&5
$as_echo "$libc_cv_sparcv8" >&6; }
if test $libc_cv_sparcv8 = no; then
as_fn_error $? "no support for pre-v8 sparc" "$LINENO" 5
fi

View File

@ -0,0 +1,13 @@
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
# Local configure fragment for sysdeps/sparc/sparc32
# Test if compiler targets at least sparcv8.
dnl The snippet below is run through the C preprocessor; it expands to the
dnl token "yes" only when the compiler predefines __sparc_v8__ or
dnl __sparc_v9__.  The yes/no outcome is cached in libc_cv_sparcv8.
AC_CACHE_CHECK([for at least sparcv8 support],
[libc_cv_sparcv8],
[AC_EGREP_CPP(yes,[#if defined (__sparc_v8__) || defined (__sparc_v9__)
yes
#endif
], libc_cv_sparcv8=yes, libc_cv_sparcv8=no)])
dnl Abort configuration when neither macro is predefined.
if test $libc_cv_sparcv8 = no; then
AC_MSG_ERROR([no support for pre-v8 sparc])
fi

View File

@ -1,234 +0,0 @@
/*
* Division and remainder, from Appendix E of the Sparc Version 8
* Architecture Manual, with fixes from Gordon Irlam.
*/
/*
* Input: dividend and divisor in %o0 and %o1 respectively.
*
* m4 parameters:
* NAME name of function to generate
* OP OP=div => %o0 / %o1; OP=rem => %o0 % %o1
* S S=true => signed; S=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top `decade' of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
define(N, `4')dnl
define(WORDSIZE, `32')dnl
define(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))dnl
dnl
define(dividend, `%o0')dnl
define(divisor, `%o1')dnl
define(Q, `%o2')dnl
define(R, `%o3')dnl
define(ITER, `%o4')dnl
define(V, `%o5')dnl
dnl
dnl m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d
define(T, `%g1')dnl
define(SC, `%g2')dnl
ifelse(S, `true', `define(SIGN, `%g3')')dnl
dnl
dnl This is the recursive definition for developing quotient digits.
dnl
dnl Parameters:
dnl $1 the current depth, 1 <= $1 <= N
dnl $2 the current accumulation of quotient bits
dnl N max depth
dnl
dnl We add a new bit to $2 and either recurse or insert the bits in
dnl the quotient. R, Q, and V are inputs and outputs as defined above;
dnl the condition codes are expected to reflect the input R, and are
dnl modified to reflect the output R.
dnl
define(DEVELOP_QUOTIENT_BITS,
` ! depth $1, accumulated bits $2
bl LOC($1.eval(2**N+$2))
srl V,1,V
! remainder is positive
subcc R,V,R
ifelse($1, N,
` b 9f
add Q, ($2*2+1), Q
', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')')
LOC($1.eval(2**N+$2)):
! remainder is negative
addcc R,V,R
ifelse($1, N,
` b 9f
add Q, ($2*2-1), Q
', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')')
ifelse($1, 1, `9:')')dnl
#include <sysdep.h>
#include <sys/trap.h>
ENTRY(NAME)
ifelse(S, `true',
` ! compute sign of result; if neither is negative, no problem
orcc divisor, dividend, %g0 ! either negative?
bge 2f ! no, go do the divide
ifelse(OP, `div',
` xor divisor, dividend, SIGN ! compute sign in any case',
` mov dividend, SIGN ! sign of remainder matches dividend')
tst divisor
bge 1f
tst dividend
! divisor is definitely negative; dividend might also be negative
bge 2f ! if dividend not negative...
sub %g0, divisor, divisor ! in any case, make divisor nonneg
1: ! dividend is negative, divisor is nonnegative
sub %g0, dividend, dividend ! make dividend nonnegative
2:
')
! Ready to divide. Compute size of quotient; scale comparand.
orcc divisor, %g0, V
bne 1f
mov dividend, R
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
clr %o0
1:
cmp R, V ! if divisor exceeds dividend, done
blu LOC(got_result) ! (and algorithm fails otherwise)
clr Q
sethi %hi(1 << (WORDSIZE - TOPBITS - 1)), T
cmp R, T
blu LOC(not_really_big)
clr ITER
! `Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
! The number of bits in the result here is N*ITER+SC, where SC <= N.
! Compute ITER in an unorthodox manner: know we need to shift V into
! the top decade: so do not even bother to compare to R.'
1:
cmp V, T
bgeu 3f
mov 1, SC
sll V, N, V
b 1b
add ITER, 1, ITER
! Now compute SC.
2: addcc V, V, V
bcc LOC(not_too_big)
add SC, 1, SC
! We get here if the divisor overflowed while shifting.
! This means that R has the high-order bit set.
! Restore V and subtract from R.
sll T, TOPBITS, T ! high order bit
srl V, 1, V ! rest of V
add V, T, V
b LOC(do_single_div)
sub SC, 1, SC
LOC(not_too_big):
3: cmp V, R
blu 2b
nop
be LOC(do_single_div)
nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! V > R: went too far: back up 1 step
! srl V, 1, V
! dec SC
! do single-bit divide steps
!
! We have to be careful here. We know that R >= V, so we can do the
! first divide step without thinking. BUT, the others are conditional,
! and are only done if R >= 0. Because both R and V may have the high-
! order bit set in the first step, just falling into the regular
! division loop will mess up the first time around.
! So we unroll slightly...
LOC(do_single_div):
subcc SC, 1, SC
bl LOC(end_regular_divide)
nop
sub R, V, R
mov 1, Q
b LOC(end_single_divloop)
nop
LOC(single_divloop):
sll Q, 1, Q
bl 1f
srl V, 1, V
! R >= 0
sub R, V, R
b 2f
add Q, 1, Q
1: ! R < 0
add R, V, R
sub Q, 1, Q
2:
LOC(end_single_divloop):
subcc SC, 1, SC
bge LOC(single_divloop)
tst R
b,a LOC(end_regular_divide)
LOC(not_really_big):
1:
sll V, N, V
cmp V, R
bleu 1b
addcc ITER, 1, ITER
be LOC(got_result)
sub ITER, 1, ITER
tst R ! set up for initial iteration
LOC(divloop):
sll Q, N, Q
DEVELOP_QUOTIENT_BITS(1, 0)
LOC(end_regular_divide):
subcc ITER, 1, ITER
bge LOC(divloop)
tst R
bl,a LOC(got_result)
! non-restoring fixup here (one instruction only!)
ifelse(OP, `div',
` sub Q, 1, Q
', ` add R, divisor, R
')
LOC(got_result):
ifelse(S, `true',
` ! check to see if answer should be < 0
tst SIGN
bl,a 1f
ifelse(OP, `div', `sub %g0, Q, Q', `sub %g0, R, R')
1:')
retl
ifelse(OP, `div', `mov Q, %o0', `mov R, %o0')
END(NAME)
ifelse(OP, `div', ifelse(S, `false', `strong_alias (.udiv, __wrap_.udiv)
'))dnl

View File

@ -1,127 +1,13 @@
/*
* Signed multiply, from Appendix E of the Sparc Version 8
* Architecture Manual.
*/
/*
* Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
* the 64-bit product).
*
* This code optimizes short (less than 13-bit) multiplies.
* Sparc v8 has multiply.
*/
#include <sysdep.h>
ENTRY(.mul)
mov %o0, %y ! multiplier -> Y
andncc %o0, 0xfff, %g0 ! test bits 12..31
be LOC(mul_shortway) ! if zero, can do it the short way
andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
/*
* Long multiply. 32 steps, followed by a final shift step.
*/
mulscc %o4, %o1, %o4 ! 1
mulscc %o4, %o1, %o4 ! 2
mulscc %o4, %o1, %o4 ! 3
mulscc %o4, %o1, %o4 ! 4
mulscc %o4, %o1, %o4 ! 5
mulscc %o4, %o1, %o4 ! 6
mulscc %o4, %o1, %o4 ! 7
mulscc %o4, %o1, %o4 ! 8
mulscc %o4, %o1, %o4 ! 9
mulscc %o4, %o1, %o4 ! 10
mulscc %o4, %o1, %o4 ! 11
mulscc %o4, %o1, %o4 ! 12
mulscc %o4, %o1, %o4 ! 13
mulscc %o4, %o1, %o4 ! 14
mulscc %o4, %o1, %o4 ! 15
mulscc %o4, %o1, %o4 ! 16
mulscc %o4, %o1, %o4 ! 17
mulscc %o4, %o1, %o4 ! 18
mulscc %o4, %o1, %o4 ! 19
mulscc %o4, %o1, %o4 ! 20
mulscc %o4, %o1, %o4 ! 21
mulscc %o4, %o1, %o4 ! 22
mulscc %o4, %o1, %o4 ! 23
mulscc %o4, %o1, %o4 ! 24
mulscc %o4, %o1, %o4 ! 25
mulscc %o4, %o1, %o4 ! 26
mulscc %o4, %o1, %o4 ! 27
mulscc %o4, %o1, %o4 ! 28
mulscc %o4, %o1, %o4 ! 29
mulscc %o4, %o1, %o4 ! 30
mulscc %o4, %o1, %o4 ! 31
mulscc %o4, %o1, %o4 ! 32
mulscc %o4, %g0, %o4 ! final shift
! If %o0 was negative, the result is
! (%o0 * %o1) + (%o1 << 32))
! We fix that here.
#if 0
tst %o0
bge 1f
rd %y, %o0
! %o0 was indeed negative; fix upper 32 bits of result by subtracting
! %o1 (i.e., return %o4 - %o1 in %o1).
smul %o0, %o1, %o0
retl
sub %o4, %o1, %o1
1:
retl
mov %o4, %o1
#else
/* Faster code adapted from tege@sics.se's code for umul.S. */
sra %o0, 31, %o2 ! make mask from sign bit
and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0
rd %y, %o0 ! get lower half of product
retl
sub %o4, %o2, %o1 ! subtract compensation
! and put upper half in place
#endif
LOC(mul_shortway):
/*
* Short multiply. 12 steps, followed by a final shift step.
* The resulting bits are off by 12 and (32-12) = 20 bit positions,
* but there is no problem with %o0 being negative (unlike above).
*/
mulscc %o4, %o1, %o4 ! 1
mulscc %o4, %o1, %o4 ! 2
mulscc %o4, %o1, %o4 ! 3
mulscc %o4, %o1, %o4 ! 4
mulscc %o4, %o1, %o4 ! 5
mulscc %o4, %o1, %o4 ! 6
mulscc %o4, %o1, %o4 ! 7
mulscc %o4, %o1, %o4 ! 8
mulscc %o4, %o1, %o4 ! 9
mulscc %o4, %o1, %o4 ! 10
mulscc %o4, %o1, %o4 ! 11
mulscc %o4, %o1, %o4 ! 12
mulscc %o4, %g0, %o4 ! final shift
/*
* %o4 has 20 of the bits that should be in the low part of the
* result; %y has the bottom 12 (as %y's top 12). That is:
*
* %o4 %y
* +----------------+----------------+
* | -12- | -20- | -12- | -20- |
* +------(---------+------)---------+
* --hi-- ----low-part----
*
* The upper 12 bits of %o4 should be sign-extended to form the
* high part of the product (i.e., highpart = %o4 >> 20).
*/
rd %y, %o5
sll %o4, 12, %o0 ! shift middle bits left 12
srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
or %o5, %o0, %o0 ! construct low part of result
retl
sra %o4, 20, %o1 ! ... and extract high part of result
rd %y, %o1
END(.mul)

View File

@ -1,198 +1,102 @@
! SPARC __mpn_mul_1 -- Multiply a limb vector with a limb and store
! the result in a second limb vector.
!
! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and
! store the product in a second limb vector.
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
!
! This file is part of the GNU MP Library.
!
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
!
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
! RES_PTR o0
! S1_PTR o1
! SIZE o2
! S2_LIMB o3
! ADD CODE FOR SMALL MULTIPLIERS!
!1: ld
! st
!
!2: ld ,a
! addxcc a,a,x
! st x,
!
!3_unrolled:
! ld ,a
! addxcc a,a,x1 ! 2a + cy
! addx %g0,%g0,x2
! addcc a,x1,x ! 3a + c
! st x,
!
! ld ,a
! addxcc a,a,y1
! addx %g0,%g0,y2
! addcc a,y1,x
! st x,
!
!4_unrolled:
! ld ,a
! srl a,2,x1 ! 4a
! addxcc y2,x1,x
! sll a,30,x2
! st x,
!
! ld ,a
! srl a,2,y1
! addxcc x2,y1,y
! sll a,30,y2
! st x,
!
!5_unrolled:
! ld ,a
! srl a,2,x1 ! 4a
! addxcc a,x1,x ! 5a + c
! sll a,30,x2
! addx %g0,x2,x2
! st x,
!
! ld ,a
! srl a,2,y1
! addxcc a,y1,x
! sll a,30,y2
! addx %g0,y2,y2
! st x,
!
!8_unrolled:
! ld ,a
! srl a,3,x1 ! 8a
! addxcc y2,x1,x
! sll a,29,x2
! st x,
!
! ld ,a
! srl a,3,y1
! addxcc x2,y1,y
! sll a,29,y2
! st x,
! res_ptr o0
! s1_ptr o1
! size o2
! s2_limb o3
#include <sysdep.h>
ENTRY(__mpn_mul_1)
! Make S1_PTR and RES_PTR point at the end of their blocks
! and put (- 4 x SIZE) in index/loop counter.
sll %o2,2,%o2
add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval
add %o1,%o2,%o1
sub %g0,%o2,%o2
sll %o2,4,%g1
mov %o7,%g4 ! Save return address register
and %g1,(4-1)<<4,%g1
1: call 2f
add %o7,3f-1b,%g3
2: mov %g4,%o7 ! Restore return address register
jmp %g3+%g1
ld [%o1+0],%o4 ! 1
cmp %o3,0xfff
bgu LOC(large)
.align 4
3:
LOC(00):
add %o0,-4,%o0
add %o1,-4,%o1
b LOC(loop00) /* 4, 8, 12, ... */
orcc %g0,%g0,%g2
LOC(01):
b LOC(loop01) /* 1, 5, 9, ... */
orcc %g0,%g0,%g2
nop
nop
LOC(10):
add %o0,-12,%o0 /* 2, 6, 10, ... */
add %o1,4,%o1
b LOC(loop10)
orcc %g0,%g0,%g2
nop
LOC(11):
add %o0,-8,%o0 /* 3, 7, 11, ... */
add %o1,-8,%o1
b LOC(loop11)
orcc %g0,%g0,%g2
ld [%o1+%o2],%o5
mov 0,%o0
b LOC(0)
add %o4,-4,%o4
LOC(loop0):
st %g1,[%o4+%o2]
LOC(0): wr %g0,%o3,%y
sra %o5,31,%g2
and %o3,%g2,%g2
andcc %g1,0,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,0,%g1
sra %g1,20,%g4
sll %g1,12,%g1
rd %y,%g3
srl %g3,20,%g3
or %g1,%g3,%g1
addcc %g1,%o0,%g1
addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb
addcc %o2,4,%o2 ! loop counter
bne,a LOC(loop0)
ld [%o1+%o2],%o5
retl
st %g1,[%o4+%o2]
LOC(large):
ld [%o1+%o2],%o5
mov 0,%o0
sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0
b LOC(1)
add %o4,-4,%o4
LOC(loop):
st %g3,[%o4+%o2]
LOC(1): wr %g0,%o5,%y
and %o5,%g4,%g2 ! g2 = S1_LIMB iff S2_LIMB < 0, else 0
andcc %g0,%g0,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%g0,%g1
rd %y,%g3
addcc %g3,%o0,%g3
addx %g2,%g1,%o0 ! add sign-compensation and cy to hi limb
addcc %o2,4,%o2 ! loop counter
bne,a LOC(loop)
ld [%o1+%o2],%o5
addcc %g3,%g2,%g3 ! 1
ld [%o1+4],%o4 ! 2
st %g3,[%o0+0] ! 1
rd %y,%g2 ! 1
LOC(loop00):
umul %o4,%o3,%g3 ! 2
addxcc %g3,%g2,%g3 ! 2
ld [%o1+8],%o4 ! 3
st %g3,[%o0+4] ! 2
rd %y,%g2 ! 2
LOC(loop11):
umul %o4,%o3,%g3 ! 3
addxcc %g3,%g2,%g3 ! 3
ld [%o1+12],%o4 ! 4
add %o1,16,%o1
st %g3,[%o0+8] ! 3
rd %y,%g2 ! 3
LOC(loop10):
umul %o4,%o3,%g3 ! 4
addxcc %g3,%g2,%g3 ! 4
ld [%o1+0],%o4 ! 1
st %g3,[%o0+12] ! 4
add %o0,16,%o0
rd %y,%g2 ! 4
addx %g0,%g2,%g2
LOC(loop01):
addcc %o2,-4,%o2
bg LOC(loop)
umul %o4,%o3,%g3 ! 1
addcc %g3,%g2,%g3 ! 4
st %g3,[%o0+0] ! 4
rd %y,%g2 ! 4
retl
st %g3,[%o4+%o2]
addx %g0,%g2,%o0
END(__mpn_mul_1)

View File

@ -1,363 +1,21 @@
/* This file is generated from divrem.m4; DO NOT EDIT! */
/*
* Division and remainder, from Appendix E of the Sparc Version 8
* Architecture Manual, with fixes from Gordon Irlam.
* Sparc v8 has divide.
*/
/*
* Input: dividend and divisor in %o0 and %o1 respectively.
*
* m4 parameters:
* .rem name of function to generate
* rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1
* true true=true => signed; true=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top decade of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
#include <sysdep.h>
#include <sys/trap.h>
/* NOTE(review): the hunk marker just above this file reads "-1,363 +1,21",
   and this ENTRY/END span contains both a long software division loop and
   a short sdivcc-based sequence with a second return.  It therefore looks
   like two versions of .rem overlaid by the diff rendering rather than
   one executable routine -- confirm against the real source file before
   relying on the instruction order shown here.  */
ENTRY(.rem)
! compute sign of result; if neither is negative, no problem
orcc %o1, %o0, %g0 ! either negative?
bge 2f ! no, go do the divide
mov %o0, %g3 ! sign of remainder matches %o0
tst %o1
bge 1f
tst %o0
! %o1 is definitely negative; %o0 might also be negative
bge 2f ! if %o0 not negative...
sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
1: ! %o0 is negative, %o1 is nonnegative
sub %g0, %o0, %o0 ! make %o0 nonnegative
2:
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu LOC(got_result) ! (and algorithm fails otherwise)
clr %o2
sethi %hi(1 << (32 - 4 - 1)), %g1
cmp %o3, %g1
blu LOC(not_really_big)
clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
! The number of bits in the result here is N*ITER+SC, where SC <= N.
! Compute ITER in an unorthodox manner: know we need to shift V into
! the top decade: so do not even bother to compare to R.
1:
cmp %o5, %g1
bgeu 3f
mov 1, %g2
sll %o5, 4, %o5
b 1b
add %o4, 1, %o4
! Now compute %g2.
2: addcc %o5, %o5, %o5
bcc LOC(not_too_big)
add %g2, 1, %g2
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
! Restore %o5 and subtract from %o3.
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
b LOC(do_single_div)
sub %g2, 1, %g2
LOC(not_too_big):
3: cmp %o5, %o3
blu 2b
nop
be LOC(do_single_div)
nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
! srl %o5, 1, %o5
! dec %g2
! do single-bit divide steps
!
! We have to be careful here. We know that %o3 >= %o5, so we can do the
! first divide step without thinking. BUT, the others are conditional,
! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
! order bit set in the first step, just falling into the regular
! division loop will mess up the first time around.
! So we unroll slightly...
LOC(do_single_div):
subcc %g2, 1, %g2
bl LOC(end_regular_divide)
nop
sub %o3, %o5, %o3
mov 1, %o2
b LOC(end_single_divloop)
nop
LOC(single_divloop):
sll %o2, 1, %o2
bl 1f
srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
2:
LOC(end_single_divloop):
subcc %g2, 1, %g2
bge LOC(single_divloop)
tst %o3
b,a LOC(end_regular_divide)
LOC(not_really_big):
1:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
addcc %o4, 1, %o4
be LOC(got_result)
sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
LOC(divloop):
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl LOC(1.16)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl LOC(2.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl LOC(3.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl LOC(4.23)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (7*2+1), %o2
LOC(4.23):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (7*2-1), %o2
LOC(3.19):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl LOC(4.21)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (5*2+1), %o2
LOC(4.21):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (5*2-1), %o2
LOC(2.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl LOC(3.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl LOC(4.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (3*2+1), %o2
LOC(4.19):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (3*2-1), %o2
LOC(3.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl LOC(4.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (1*2+1), %o2
LOC(4.17):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (1*2-1), %o2
LOC(1.16):
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl LOC(2.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl LOC(3.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl LOC(4.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-1*2+1), %o2
LOC(4.15):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-1*2-1), %o2
LOC(3.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl LOC(4.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-3*2+1), %o2
LOC(4.13):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-3*2-1), %o2
LOC(2.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl LOC(3.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl LOC(4.11)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-5*2+1), %o2
LOC(4.11):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-5*2-1), %o2
LOC(3.13):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl LOC(4.9)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-7*2+1), %o2
LOC(4.9):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-7*2-1), %o2
9:
LOC(end_regular_divide):
subcc %o4, 1, %o4
bge LOC(divloop)
tst %o3
bl,a LOC(got_result)
! non-restoring fixup here (one instruction only!)
add %o3, %o1, %o3
LOC(got_result):
! check to see if answer should be < 0
tst %g3
bl,a 1f
sub %g0, %o3, %o3
1:
/* NOTE(review): the sra/wr/nop/sdivcc/smul/sub instructions below look
   like the hardware-divide variant of .rem, while the "mov %o3, %o0"
   after retl looks like the tail of the software variant -- TODO confirm
   which lines belong to which version.  */
sra %o0, 31, %o2
wr %o2, 0, %y
nop
nop
nop
sdivcc %o0, %o1, %o2
bvs,a 1f
xnor %o2, %g0, %o2
1: smul %o2, %o1, %o2
retl
mov %o3, %o0
sub %o0, %o2, %o0
END(.rem)

View File

@ -1,363 +1,20 @@
/* This file is generated from divrem.m4; DO NOT EDIT! */
/*
* Division and remainder, from Appendix E of the Sparc Version 8
* Architecture Manual, with fixes from Gordon Irlam.
* Sparc v8 has divide.
*/
/*
* Input: dividend and divisor in %o0 and %o1 respectively.
*
* m4 parameters:
* .div name of function to generate
* div div=div => %o0 / %o1; div=rem => %o0 % %o1
* true true=true => signed; true=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top decade of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
#include <sysdep.h>
#include <sys/trap.h>
/* NOTE(review): the hunk marker just above this file reads "-1,363 +1,20",
   and this ENTRY/END span contains both a long software division loop and
   a short sdivcc-based sequence with a second return.  It therefore looks
   like two versions of .div overlaid by the diff rendering rather than
   one executable routine -- confirm against the real source file before
   relying on the instruction order shown here.  */
ENTRY(.div)
! compute sign of result; if neither is negative, no problem
orcc %o1, %o0, %g0 ! either negative?
bge 2f ! no, go do the divide
xor %o1, %o0, %g3 ! compute sign in any case
tst %o1
bge 1f
tst %o0
! %o1 is definitely negative; %o0 might also be negative
bge 2f ! if %o0 not negative...
sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
1: ! %o0 is negative, %o1 is nonnegative
sub %g0, %o0, %o0 ! make %o0 nonnegative
2:
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu LOC(got_result) ! (and algorithm fails otherwise)
clr %o2
sethi %hi(1 << (32 - 4 - 1)), %g1
cmp %o3, %g1
blu LOC(not_really_big)
clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
! The number of bits in the result here is N*ITER+SC, where SC <= N.
! Compute ITER in an unorthodox manner: know we need to shift V into
! the top decade: so do not even bother to compare to R.
1:
cmp %o5, %g1
bgeu 3f
mov 1, %g2
sll %o5, 4, %o5
b 1b
add %o4, 1, %o4
! Now compute %g2.
2: addcc %o5, %o5, %o5
bcc LOC(not_too_big)
add %g2, 1, %g2
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
! Restore %o5 and subtract from %o3.
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
b LOC(do_single_div)
sub %g2, 1, %g2
LOC(not_too_big):
3: cmp %o5, %o3
blu 2b
nop
be LOC(do_single_div)
nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
! srl %o5, 1, %o5
! dec %g2
! do single-bit divide steps
!
! We have to be careful here. We know that %o3 >= %o5, so we can do the
! first divide step without thinking. BUT, the others are conditional,
! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
! order bit set in the first step, just falling into the regular
! division loop will mess up the first time around.
! So we unroll slightly...
LOC(do_single_div):
subcc %g2, 1, %g2
bl LOC(end_regular_divide)
nop
sub %o3, %o5, %o3
mov 1, %o2
b LOC(end_single_divloop)
nop
LOC(single_divloop):
sll %o2, 1, %o2
bl 1f
srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
2:
LOC(end_single_divloop):
subcc %g2, 1, %g2
bge LOC(single_divloop)
tst %o3
b,a LOC(end_regular_divide)
LOC(not_really_big):
1:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
addcc %o4, 1, %o4
be LOC(got_result)
sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
LOC(divloop):
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl LOC(1.16)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl LOC(2.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl LOC(3.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl LOC(4.23)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (7*2+1), %o2
LOC(4.23):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (7*2-1), %o2
LOC(3.19):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl LOC(4.21)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (5*2+1), %o2
LOC(4.21):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (5*2-1), %o2
LOC(2.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl LOC(3.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl LOC(4.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (3*2+1), %o2
LOC(4.19):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (3*2-1), %o2
LOC(3.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl LOC(4.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (1*2+1), %o2
LOC(4.17):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (1*2-1), %o2
LOC(1.16):
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl LOC(2.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl LOC(3.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl LOC(4.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-1*2+1), %o2
LOC(4.15):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-1*2-1), %o2
LOC(3.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl LOC(4.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-3*2+1), %o2
LOC(4.13):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-3*2-1), %o2
LOC(2.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl LOC(3.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl LOC(4.11)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-5*2+1), %o2
LOC(4.11):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-5*2-1), %o2
LOC(3.13):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl LOC(4.9)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-7*2+1), %o2
LOC(4.9):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-7*2-1), %o2
9:
LOC(end_regular_divide):
subcc %o4, 1, %o4
bge LOC(divloop)
tst %o3
bl,a LOC(got_result)
! non-restoring fixup here (one instruction only!)
sub %o2, 1, %o2
LOC(got_result):
! check to see if answer should be < 0
tst %g3
bl,a 1f
sub %g0, %o2, %o2
1:
retl
mov %o2, %o0
/* NOTE(review): the sra/wr/nop/sdivcc/xnor sequence below looks like the
   hardware-divide variant of .div, placed after the software variant's
   "retl; mov %o2, %o0" by the diff rendering -- TODO confirm which lines
   belong to which version.  */
sra %o0, 31, %o2
wr %o2, 0, %y
nop
nop
nop
sdivcc %o0, %o1, %o0
bvs,a 1f
xnor %o0, %g0, %o0
1: retl
nop
END(.div)

View File

@ -1 +0,0 @@
sysdep-CFLAGS += -mcpu=v8

View File

@ -1,118 +0,0 @@
! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and
! add the result to a second limb vector.
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
! This file is part of the GNU MP Library.
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
! res_ptr o0
! s1_ptr o1
! size o2
! s2_limb o3
#include <sysdep.h>
/* Multiply the `size' limbs at s1_ptr (%o1) by s2_limb (%o3), add the
   products into the limbs at res_ptr (%o0), and return the final carry
   limb in %o0.  The loop body is unrolled four times; a computed jump
   enters it at the position that matches size mod 4.  */
ENTRY(__mpn_addmul_1)
ld [%o1+0],%o4 ! 1
sll %o2,4,%g1
/* Zero the running carry limb %g2 and clear the carry flag.  */
orcc %g0,%g0,%g2
mov %o7,%g4 ! Save return address register
/* %g1 = (size mod 4) * 16: byte offset of one of the four stubs below,
   each of which is four instructions long.  */
and %g1,(4-1)<<4,%g1
/* call leaves the address of label 1 in %o7; the delay slot turns that
   into the address of label 3, the start of the stub table.  */
1: call 2f
add %o7,3f-1b,%g3
2: jmp %g3+%g1
mov %g4,%o7 ! Restore return address register
.align 4
3:
/* Entry stubs: each pre-adjusts res_ptr and s1_ptr so the fixed offsets
   used at its loop entry point address the first limb, then branches in.
   The nops pad every stub to four instructions.  */
LOC(00):
add %o0,-4,%o0
b LOC(loop00) /* 4, 8, 12, ... */
add %o1,-4,%o1
nop
LOC(01):
b LOC(loop01) /* 1, 5, 9, ... */
nop
nop
nop
LOC(10):
add %o0,-12,%o0 /* 2, 6, 10, ... */
b LOC(loop10)
add %o1,4,%o1
nop
LOC(11):
add %o0,-8,%o0 /* 3, 7, 11, ... */
b LOC(loop11)
add %o1,-8,%o1
nop
/* Main loop, four limbs per pass.  For each limb: umul leaves the low
   product word in %g3 and the high word in %y; the low word plus the
   previous carry limb is added to the destination limb and stored, and
   the high word plus the carries out of those additions becomes the next
   carry limb in %g2.  */
LOC(loop):
addcc %g3,%g2,%g3 ! 1
ld [%o1+4],%o4 ! 2
rd %y,%g2 ! 1
addx %g0,%g2,%g2
ld [%o0+0],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+0] ! 1
LOC(loop00):
umul %o4,%o3,%g3 ! 2
ld [%o0+4],%g1 ! 2
addxcc %g3,%g2,%g3 ! 2
ld [%o1+8],%o4 ! 3
rd %y,%g2 ! 2
addx %g0,%g2,%g2
nop
addcc %g1,%g3,%g3
st %g3,[%o0+4] ! 2
LOC(loop11):
umul %o4,%o3,%g3 ! 3
addxcc %g3,%g2,%g3 ! 3
ld [%o1+12],%o4 ! 4
rd %y,%g2 ! 3
add %o1,16,%o1
addx %g0,%g2,%g2
ld [%o0+8],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+8] ! 3
LOC(loop10):
umul %o4,%o3,%g3 ! 4
addxcc %g3,%g2,%g3 ! 4
ld [%o1+0],%o4 ! 1
rd %y,%g2 ! 4
addx %g0,%g2,%g2
ld [%o0+12],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+12] ! 4
add %o0,16,%o0
addx %g0,%g2,%g2
LOC(loop01):
/* Four fewer limbs remain; loop while the count is still positive.  The
   umul in the delay slot runs whether or not the branch is taken, so it
   also produces the product for the final limb handled below.  */
addcc %o2,-4,%o2
bg LOC(loop)
umul %o4,%o3,%g3 ! 1
/* Last limb: same add-and-store step as in the loop.  */
addcc %g3,%g2,%g3 ! 4
rd %y,%g2 ! 4
addx %g0,%g2,%g2
ld [%o0+0],%g1 ! 2
addcc %g1,%g3,%g3
st %g3,[%o0+0] ! 4
retl
/* Delay slot: fold the last carry into %g2 and return it in %o0.  */
addx %g0,%g2,%o0
END(__mpn_addmul_1)

View File

@ -1,13 +0,0 @@
/*
* Sparc v8 has multiply.
*/
#include <sysdep.h>
/* Signed 32x32 multiply of %o0 by %o1 using the hardware smul
   instruction.  The low 32 bits of the product are returned in %o0 and
   the contents of %y (where smul leaves the upper 32 bits) in %o1.  */
ENTRY(.mul)
smul %o0, %o1, %o0
retl
/* Delay slot of retl: copy %y into %o1 before control returns.  */
rd %y, %o1
END(.mul)

View File

@ -1,102 +0,0 @@
! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and
! store the product in a second limb vector.
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
! This file is part of the GNU MP Library.
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
! res_ptr o0
! s1_ptr o1
! size o2
! s2_limb o3
#include <sysdep.h>
/* Multiply the `size' limbs at s1_ptr (%o1) by s2_limb (%o3), store the
   low product words at res_ptr (%o0), and return the final carry limb in
   %o0.  The loop body is unrolled four times; a computed jump enters it
   at the position that matches size mod 4.  */
ENTRY(__mpn_mul_1)
sll %o2,4,%g1
mov %o7,%g4 ! Save return address register
/* %g1 = (size mod 4) * 16: byte offset into the stub table below.  */
and %g1,(4-1)<<4,%g1
/* call leaves the address of label 1 in %o7; the delay slot turns that
   into the address of label 3, the start of the stub table.  */
1: call 2f
add %o7,3f-1b,%g3
2: mov %g4,%o7 ! Restore return address register
jmp %g3+%g1
/* Delay slot of the jump: preload the first source limb.  */
ld [%o1+0],%o4 ! 1
.align 4
3:
/* Entry stubs: each pre-adjusts res_ptr and s1_ptr for its loop entry
   point and, in the branch delay slot, zeroes the carry limb %g2 and
   clears the carry flag.  */
LOC(00):
add %o0,-4,%o0
add %o1,-4,%o1
b LOC(loop00) /* 4, 8, 12, ... */
orcc %g0,%g0,%g2
LOC(01):
b LOC(loop01) /* 1, 5, 9, ... */
orcc %g0,%g0,%g2
nop
nop
LOC(10):
add %o0,-12,%o0 /* 2, 6, 10, ... */
add %o1,4,%o1
b LOC(loop10)
orcc %g0,%g0,%g2
/* NOTE(review): with this nop the LOC(10) stub is five instructions long,
   not four, so the computed jump for size mod 4 == 3 (offset 48 from
   label 3) appears to land on this nop and fall through into LOC(11).
   That looks harmless, but confirm it is intended.  */
nop
LOC(11):
add %o0,-8,%o0 /* 3, 7, 11, ... */
add %o1,-8,%o1
b LOC(loop11)
orcc %g0,%g0,%g2
/* Main loop, four limbs per pass.  For each limb: umul leaves the low
   product word in %g3 and the high word in %y; the low word plus the
   previous carry limb is stored to the destination, and the high word
   read from %y becomes the next carry limb in %g2.  */
LOC(loop):
addcc %g3,%g2,%g3 ! 1
ld [%o1+4],%o4 ! 2
st %g3,[%o0+0] ! 1
rd %y,%g2 ! 1
LOC(loop00):
umul %o4,%o3,%g3 ! 2
addxcc %g3,%g2,%g3 ! 2
ld [%o1+8],%o4 ! 3
st %g3,[%o0+4] ! 2
rd %y,%g2 ! 2
LOC(loop11):
umul %o4,%o3,%g3 ! 3
addxcc %g3,%g2,%g3 ! 3
ld [%o1+12],%o4 ! 4
add %o1,16,%o1
st %g3,[%o0+8] ! 3
rd %y,%g2 ! 3
LOC(loop10):
umul %o4,%o3,%g3 ! 4
addxcc %g3,%g2,%g3 ! 4
ld [%o1+0],%o4 ! 1
st %g3,[%o0+12] ! 4
add %o0,16,%o0
rd %y,%g2 ! 4
addx %g0,%g2,%g2
LOC(loop01):
/* Four fewer limbs remain; loop while the count is still positive.  The
   umul in the delay slot runs whether or not the branch is taken, so it
   also produces the product for the final limb handled below.  */
addcc %o2,-4,%o2
bg LOC(loop)
umul %o4,%o3,%g3 ! 1
/* Last limb: add the carry limb, store, and fetch the high word.  */
addcc %g3,%g2,%g3 ! 4
st %g3,[%o0+0] ! 4
rd %y,%g2 ! 4
retl
/* Delay slot: fold the last carry into the high word and return it.  */
addx %g0,%g2,%o0
END(__mpn_mul_1)

View File

@ -1,21 +0,0 @@
/*
* Sparc v8 has divide.
*/
#include <sysdep.h>
/* Signed remainder: returns %o0 % %o1 in %o0, computed with the hardware
   divider as dividend - (dividend / divisor) * divisor.  */
ENTRY(.rem)
/* Put the sign extension of the dividend into %y so that %y:%o0 forms
   the 64-bit dividend that sdivcc reads.  */
sra %o0, 31, %o2
wr %o2, 0, %y
/* Padding between the %y write and the divide that reads it;
   presumably required by the V8 delayed-write rule for wr -- confirm
   against the architecture manual.  */
nop
nop
nop
sdivcc %o0, %o1, %o2
/* The annulled branch executes the xnor (bitwise NOT of the quotient)
   only when sdivcc set the overflow flag.  Presumably this corrects the
   quotient the hardware produces on overflow (e.g. INT_MIN / -1) --
   TODO confirm.  */
bvs,a 1f
xnor %o2, %g0, %o2
1: smul %o2, %o1, %o2
retl
/* Delay slot: remainder = dividend - quotient * divisor.  */
sub %o0, %o2, %o0
END(.rem)

View File

@ -1,20 +0,0 @@
/*
* Sparc v8 has divide.
*/
#include <sysdep.h>
/* Signed division: returns %o0 / %o1 in %o0 using the hardware
   divider.  */
ENTRY(.div)
/* Put the sign extension of the dividend into %y so that %y:%o0 forms
   the 64-bit dividend that sdivcc reads.  */
sra %o0, 31, %o2
wr %o2, 0, %y
/* Padding between the %y write and the divide that reads it;
   presumably required by the V8 delayed-write rule for wr -- confirm
   against the architecture manual.  */
nop
nop
nop
sdivcc %o0, %o1, %o0
/* The annulled branch executes the xnor (bitwise NOT of the quotient)
   only when sdivcc set the overflow flag.  Presumably this corrects the
   quotient the hardware produces on overflow (e.g. INT_MIN / -1) --
   TODO confirm.  */
bvs,a 1f
xnor %o0, %g0, %o0
1: retl
nop
END(.div)

View File

@ -1,57 +0,0 @@
! SPARC v8 __mpn_submul_1 -- Multiply a limb vector with a limb and
! subtract the result from a second limb vector.
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
! This file is part of the GNU MP Library.
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
! res_ptr o0
! s1_ptr o1
! size o2
! s2_limb o3
#include <sysdep.h>
/* Multiply the `size' limbs at s1_ptr (%o1) by s2_limb (%o3), subtract
   the products from the limbs at res_ptr (%o0), and return the final
   carry/borrow limb in %o0.  */
ENTRY(__mpn_submul_1)
/* Turn size into a negative byte offset (-4 * size) and bias both
   pointers by it, so that pointer + %o2 starts at the first limb and
   %o2 reaches zero exactly when the last limb has been processed.  */
sub %g0,%o2,%o2 ! negate ...
sll %o2,2,%o2 ! ... and scale size
sub %o1,%o2,%o1 ! o1 is offset s1_ptr
sub %o0,%o2,%g1 ! g1 is offset res_ptr
mov 0,%o0 ! clear cy_limb
LOC(loop):
ld [%o1+%o2],%o4
ld [%g1+%o2],%g2
/* Low product word in %o5, high word read back from %y into %g3.  */
umul %o4,%o3,%o5
rd %y,%g3
/* Add the incoming carry limb to the low word; the carry out of that
   addition goes into the new carry limb together with the high word.  */
addcc %o5,%o0,%o5
addx %g3,0,%o0
/* Subtract from the destination limb; the borrow is added to the carry
   limb for the next iteration.  */
subcc %g2,%o5,%g2
addx %o0,0,%o0
st %g2,[%g1+%o2]
/* Advance to the next limb; the loop ends when the offset reaches 0.  */
addcc %o2,4,%o2
bne LOC(loop)
nop
retl
nop
END(__mpn_submul_1)

View File

@ -1,16 +0,0 @@
/*
* Sparc v8 has divide.
*/
#include <sysdep.h>
/* Unsigned division: returns %o0 / %o1 in %o0 using the hardware
   divider.  */
ENTRY(.udiv)
/* Clear %y so that %y:%o0, the 64-bit dividend udiv reads, is just the
   zero-extended %o0.  */
wr %g0, 0, %y
/* These nops plus the retl separate the %y write from the udiv that
   reads it; presumably this satisfies the V8 delayed-write rule for
   wr -- confirm against the architecture manual.  */
nop
nop
retl
/* Delay slot of retl: the divide itself.  */
udiv %o0, %o1, %o0
END(.udiv)
/* Also export the routine under the name __wrap_.udiv.  */
strong_alias (.udiv, __wrap_.udiv)

View File

@ -1,13 +0,0 @@
/*
* Sparc v8 has multiply.
*/
#include <sysdep.h>
/* Unsigned 32x32 multiply of %o0 by %o1 using the hardware umul
   instruction.  The low 32 bits of the product are returned in %o0 and
   the contents of %y (where umul leaves the upper 32 bits) in %o1.  */
ENTRY(.umul)
umul %o0, %o1, %o0
retl
/* Delay slot of retl: copy %y into %o1 before control returns.  */
rd %y, %o1
END(.umul)

View File

@ -1,18 +0,0 @@
/*
* Sparc v8 has divide.
*/
#include <sysdep.h>
/* Unsigned remainder: returns %o0 % %o1 in %o0, computed with the
   hardware divider as dividend - (dividend / divisor) * divisor.  */
ENTRY(.urem)
/* Clear %y so that %y:%o0, the 64-bit dividend udiv reads, is just the
   zero-extended %o0.  */
wr %g0, 0, %y
/* Padding between the %y write and the divide that reads it;
   presumably required by the V8 delayed-write rule for wr -- confirm
   against the architecture manual.  */
nop
nop
nop
udiv %o0, %o1, %o2
umul %o2, %o1, %o2
retl
/* Delay slot: remainder = dividend - quotient * divisor.  */
sub %o0, %o2, %o0
END(.urem)

View File

@ -1,146 +1,57 @@
! SPARC __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
! the result from a second limb vector.
!
! SPARC v8 __mpn_submul_1 -- Multiply a limb vector with a limb and
! subtract the result from a second limb vector.
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
!
! This file is part of the GNU MP Library.
!
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
!
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
! RES_PTR o0
! S1_PTR o1
! SIZE o2
! S2_LIMB o3
! res_ptr o0
! s1_ptr o1
! size o2
! s2_limb o3
#include <sysdep.h>
ENTRY(__mpn_submul_1)
! Make S1_PTR and RES_PTR point at the end of their blocks
! and put (- 4 x SIZE) in index/loop counter.
sll %o2,2,%o2
add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval
add %o1,%o2,%o1
sub %g0,%o2,%o2
sub %g0,%o2,%o2 ! negate ...
sll %o2,2,%o2 ! ... and scale size
sub %o1,%o2,%o1 ! o1 is offset s1_ptr
sub %o0,%o2,%g1 ! g1 is offset res_ptr
cmp %o3,0xfff
bgu LOC(large)
nop
mov 0,%o0 ! clear cy_limb
ld [%o1+%o2],%o5
mov 0,%o0
b LOC(0)
add %o4,-4,%o4
LOC(loop0):
subcc %o5,%g1,%g1
ld [%o1+%o2],%o5
addx %o0,%g0,%o0
st %g1,[%o4+%o2]
LOC(0): wr %g0,%o3,%y
sra %o5,31,%g2
and %o3,%g2,%g2
andcc %g1,0,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,%o5,%g1
mulscc %g1,0,%g1
sra %g1,20,%g4
sll %g1,12,%g1
rd %y,%g3
srl %g3,20,%g3
or %g1,%g3,%g1
addcc %g1,%o0,%g1
addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb
addcc %o2,4,%o2 ! loop counter
bne LOC(loop0)
ld [%o4+%o2],%o5
subcc %o5,%g1,%g1
addx %o0,%g0,%o0
retl
st %g1,[%o4+%o2]
LOC(large):
ld [%o1+%o2],%o5
mov 0,%o0
sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0
b LOC(1)
add %o4,-4,%o4
LOC(loop):
subcc %o5,%g3,%g3
ld [%o1+%o2],%o5
addx %o0,%g0,%o0
st %g3,[%o4+%o2]
LOC(1): wr %g0,%o5,%y
and %o5,%g4,%g2
andcc %g0,%g0,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%o3,%g1
mulscc %g1,%g0,%g1
ld [%o1+%o2],%o4
ld [%g1+%o2],%g2
umul %o4,%o3,%o5
rd %y,%g3
addcc %g3,%o0,%g3
addx %g2,%g1,%o0
addcc %o5,%o0,%o5
addx %g3,0,%o0
subcc %g2,%o5,%g2
addx %o0,0,%o0
st %g2,[%g1+%o2]
addcc %o2,4,%o2
bne LOC(loop)
ld [%o4+%o2],%o5
nop
subcc %o5,%g3,%g3
addx %o0,%g0,%o0
retl
st %g3,[%o4+%o2]
nop
END(__mpn_submul_1)

View File

@ -1,347 +1,16 @@
/* This file is generated from divrem.m4; DO NOT EDIT! */
/*
* Division and remainder, from Appendix E of the Sparc Version 8
* Architecture Manual, with fixes from Gordon Irlam.
* Sparc v8 has divide.
*/
/*
* Input: dividend and divisor in %o0 and %o1 respectively.
*
* m4 parameters:
* .udiv name of function to generate
* div div=div => %o0 / %o1; div=rem => %o0 % %o1
* false false=true => signed; false=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top decade of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
#include <sysdep.h>
#include <sys/trap.h>
/* NOTE(review): the hunk marker just above this file reads "-1,347 +1,16",
   and this ENTRY/END span contains both a long software division loop and
   a short udiv-based sequence around the final return.  It therefore
   looks like two versions of .udiv overlaid by the diff rendering rather
   than one executable routine -- confirm against the real source file
   before relying on the instruction order shown here.  */
ENTRY(.udiv)
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu LOC(got_result) ! (and algorithm fails otherwise)
clr %o2
sethi %hi(1 << (32 - 4 - 1)), %g1
cmp %o3, %g1
blu LOC(not_really_big)
clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
! The number of bits in the result here is N*ITER+SC, where SC <= N.
! Compute ITER in an unorthodox manner: know we need to shift V into
! the top decade: so do not even bother to compare to R.
1:
cmp %o5, %g1
bgeu 3f
mov 1, %g2
sll %o5, 4, %o5
b 1b
add %o4, 1, %o4
! Now compute %g2.
2: addcc %o5, %o5, %o5
bcc LOC(not_too_big)
add %g2, 1, %g2
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
! Restore %o5 and subtract from %o3.
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
b LOC(do_single_div)
sub %g2, 1, %g2
LOC(not_too_big):
3: cmp %o5, %o3
blu 2b
nop
be LOC(do_single_div)
nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
! srl %o5, 1, %o5
! dec %g2
! do single-bit divide steps
!
! We have to be careful here. We know that %o3 >= %o5, so we can do the
! first divide step without thinking. BUT, the others are conditional,
! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
! order bit set in the first step, just falling into the regular
! division loop will mess up the first time around.
! So we unroll slightly...
LOC(do_single_div):
subcc %g2, 1, %g2
bl LOC(end_regular_divide)
nop
sub %o3, %o5, %o3
mov 1, %o2
b LOC(end_single_divloop)
nop
LOC(single_divloop):
sll %o2, 1, %o2
bl 1f
srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
2:
LOC(end_single_divloop):
subcc %g2, 1, %g2
bge LOC(single_divloop)
tst %o3
b,a LOC(end_regular_divide)
LOC(not_really_big):
1:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
addcc %o4, 1, %o4
be LOC(got_result)
sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
LOC(divloop):
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl LOC(1.16)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl LOC(2.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl LOC(3.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl LOC(4.23)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (7*2+1), %o2
LOC(4.23):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (7*2-1), %o2
LOC(3.19):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl LOC(4.21)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (5*2+1), %o2
LOC(4.21):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (5*2-1), %o2
LOC(2.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl LOC(3.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl LOC(4.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (3*2+1), %o2
LOC(4.19):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (3*2-1), %o2
LOC(3.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl LOC(4.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (1*2+1), %o2
LOC(4.17):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (1*2-1), %o2
LOC(1.16):
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl LOC(2.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl LOC(3.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl LOC(4.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-1*2+1), %o2
LOC(4.15):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-1*2-1), %o2
LOC(3.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl LOC(4.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-3*2+1), %o2
LOC(4.13):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-3*2-1), %o2
LOC(2.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl LOC(3.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl LOC(4.11)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-5*2+1), %o2
LOC(4.11):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-5*2-1), %o2
LOC(3.13):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl LOC(4.9)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-7*2+1), %o2
LOC(4.9):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-7*2-1), %o2
9:
LOC(end_regular_divide):
subcc %o4, 1, %o4
bge LOC(divloop)
tst %o3
bl,a LOC(got_result)
! non-restoring fixup here (one instruction only!)
sub %o2, 1, %o2
LOC(got_result):
/* NOTE(review): the wr/nop/nop/retl/udiv instructions below look like
   the hardware-divide variant of .udiv, while the "mov %o2, %o0" after
   retl looks like the tail of the software variant -- TODO confirm which
   lines belong to which version.  */
wr %g0, 0, %y
nop
nop
retl
mov %o2, %o0
udiv %o0, %o1, %o0
END(.udiv)
strong_alias (.udiv, __wrap_.udiv)

View File

@ -1,155 +1,13 @@
/*
* Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
* upper 32 bits of the 64-bit product).
*
* This code optimizes short (less than 13-bit) multiplies. Short
* multiplies require 25 instruction cycles, and long ones require
* 45 instruction cycles.
*
* On return, overflow has occurred (%o1 is not zero) if and only if
* the Z condition code is clear, allowing, e.g., the following:
*
* call .umul
* nop
* bnz overflow (or tnz)
* Sparc v8 has multiply.
*/
#include <sysdep.h>
/*
 * .umul -- unsigned 32x32 multiply.  Per the file header comment, the
 * product %o0 * %o1 is returned in %o1:%o0 (%o1 holds the upper 32 bits).
 *
 * NOTE(review): this span looks like a rendered diff whose +/- markers
 * were dropped (see the "@ -1,155 +1,13 @" hunk header above).  The
 * mulscc-based routine and a short hardware sequence (umul / retl /
 * rd %y) appear interleaved; confirm which lines belong to the real file
 * before relying on the instruction order shown here.
 */
ENTRY(.umul)
/* %o4 = %o0 | %o1, so a single test below covers the high bits of both
   operands. */
or %o0, %o1, %o4
mov %o0, %y ! multiplier -> Y
andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
be LOC(mul_shortway) ! if zero, can do it the short way
andcc %g0, %g0, %o4 ! zero the partial product; clear N & V
/*
* Long multiply. 32 steps, followed by a final shift step.
*/
mulscc %o4, %o1, %o4 ! 1
mulscc %o4, %o1, %o4 ! 2
mulscc %o4, %o1, %o4 ! 3
mulscc %o4, %o1, %o4 ! 4
mulscc %o4, %o1, %o4 ! 5
mulscc %o4, %o1, %o4 ! 6
mulscc %o4, %o1, %o4 ! 7
mulscc %o4, %o1, %o4 ! 8
mulscc %o4, %o1, %o4 ! 9
mulscc %o4, %o1, %o4 ! 10
mulscc %o4, %o1, %o4 ! 11
mulscc %o4, %o1, %o4 ! 12
mulscc %o4, %o1, %o4 ! 13
mulscc %o4, %o1, %o4 ! 14
mulscc %o4, %o1, %o4 ! 15
mulscc %o4, %o1, %o4 ! 16
mulscc %o4, %o1, %o4 ! 17
mulscc %o4, %o1, %o4 ! 18
mulscc %o4, %o1, %o4 ! 19
mulscc %o4, %o1, %o4 ! 20
mulscc %o4, %o1, %o4 ! 21
mulscc %o4, %o1, %o4 ! 22
mulscc %o4, %o1, %o4 ! 23
mulscc %o4, %o1, %o4 ! 24
mulscc %o4, %o1, %o4 ! 25
mulscc %o4, %o1, %o4 ! 26
mulscc %o4, %o1, %o4 ! 27
mulscc %o4, %o1, %o4 ! 28
mulscc %o4, %o1, %o4 ! 29
mulscc %o4, %o1, %o4 ! 30
mulscc %o4, %o1, %o4 ! 31
mulscc %o4, %o1, %o4 ! 32
mulscc %o4, %g0, %o4 ! final shift
/*
* Normally, with the shift-and-add approach, if both numbers are
* positive you get the correct result. With 32-bit two's-complement
* numbers, -x is represented as
*
* ((2 - x / 2^32) mod 2) * 2^32
*
* (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
* we can treat this as if the radix point were just to the left
* of the sign bit (multiply by 2^32), and get
*
* -x = (2 - x) mod 2
*
* Then, ignoring the `mod 2's for convenience:
*
* x * y = xy
* -x * y = 2y - xy
* x * -y = 2x - xy
* -x * -y = 4 - 2x - 2y + xy
*
* For signed multiplies, we subtract (x << 32) from the partial
* product to fix this problem for negative multipliers (see mul.s).
* Because of the way the shift into the partial product is calculated
* (N xor V), this term is automatically removed for the multiplicand,
* so we don't have to adjust.
*
* But for unsigned multiplies, the high order bit wasn't a sign bit,
* and the correction is wrong. So for unsigned multiplies where the
* high order bit is one, we end up with xy - (y << 32). To fix it
* we add y << 32.
*/
/* The "#if 0" arm is never assembled; the "#else" arm below is the live
   correction code. */
#if 0
tst %o1
bl,a 1f ! if %o1 < 0 (high order bit = 1),
add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
1: rd %y, %o0 ! get lower half of product
/* NOTE(review): as rendered, this hardware umul sits inside the disabled
   "#if 0" arm.  It is presumably an added line of the diff (the header
   comment says "Sparc v8 has multiply") rather than part of the old
   routine -- confirm. */
umul %o0, %o1, %o0
retl
addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
#else
/* Faster code from tege@sics.se. */
sra %o1, 31, %o2 ! make mask from sign bit
and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1
rd %y, %o0 ! get lower half of product
retl
addcc %o4, %o2, %o1 ! add compensation and put upper half in place
#endif
LOC(mul_shortway):
/*
* Short multiply. 12 steps, followed by a final shift step.
* The resulting bits are off by 12 and (32-12) = 20 bit positions,
* but there is no problem with %o0 being negative (unlike above),
* and overflow is impossible (the answer is at most 24 bits long).
*/
mulscc %o4, %o1, %o4 ! 1
mulscc %o4, %o1, %o4 ! 2
mulscc %o4, %o1, %o4 ! 3
mulscc %o4, %o1, %o4 ! 4
mulscc %o4, %o1, %o4 ! 5
mulscc %o4, %o1, %o4 ! 6
mulscc %o4, %o1, %o4 ! 7
mulscc %o4, %o1, %o4 ! 8
mulscc %o4, %o1, %o4 ! 9
mulscc %o4, %o1, %o4 ! 10
mulscc %o4, %o1, %o4 ! 11
mulscc %o4, %o1, %o4 ! 12
mulscc %o4, %g0, %o4 ! final shift
/*
* %o4 has 20 of the bits that should be in the result; %y has
* the bottom 12 (as %y's top 12). That is:
*
* %o4 %y
* +----------------+----------------+
* | -12- | -20- | -12- | -20- |
* +------(---------+------)---------+
* -----result-----
*
* The 12 bits of %o4 left of the `result' area are all zero;
* in fact, all top 20 bits of %o4 are zero.
*/
/* Reassemble the 32-bit result: %o4 shifted up by 12, OR'ed with the top
   12 bits of %y shifted down by 20. */
rd %y, %o5
sll %o4, 12, %o0 ! shift middle bits left 12
srl %o5, 20, %o5 ! shift low bits right 20
or %o5, %o0, %o0
retl
addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
/* NOTE(review): as rendered, this rd follows a retl and its delay-slot
   instruction, so nothing falls through to it.  It presumably pairs with
   the umul/retl lines above as the hardware version (high half of the
   product read from %y into %o1) -- confirm against the real file. */
rd %y, %o1
END(.umul)

View File

@ -1,346 +1,18 @@
/* This file is generated from divrem.m4; DO NOT EDIT! */
/*
* Division and remainder, from Appendix E of the Sparc Version 8
* Architecture Manual, with fixes from Gordon Irlam.
* Sparc v8 has divide.
*/
/*
* Input: dividend and divisor in %o0 and %o1 respectively.
*
* m4 parameters:
* .urem name of function to generate
* rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1
* false false=true => signed; false=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top decade of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
#include <sysdep.h>
#include <sys/trap.h>
/*
 * .urem -- unsigned remainder.  Per the file header: dividend in %o0,
 * divisor in %o1, and the rem=rem / false=false parameters select the
 * unsigned %o0 % %o1 variant.
 *
 * Register roles as used below (matching the header's "important
 * variables" as far as the code shows):
 *   %o3  running remainder, starts as the dividend
 *   %o5  comparand, starts as the divisor and is scaled up/down
 *   %o2  quotient digits accumulated so far
 *   %o4  count of 4-bit main-loop iterations
 *   %g2  count of single-bit steps on the large-dividend path
 *
 * NOTE(review): this span looks like a rendered diff whose +/- markers
 * were dropped (see the "@ -1,346 +1,18 @" hunk header above).  The
 * software divide and a short hardware udiv/umul/sub sequence appear
 * interleaved after got_result; confirm against the real file.
 */
ENTRY(.urem)
! Ready to divide. Compute size of quotient; scale comparand.
/* orcc copies the divisor into %o5 and sets Z when it is zero; the delay
   slot copies the dividend into %o3. */
orcc %o1, %g0, %o5
bne 1f
mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu LOC(got_result) ! (and algorithm fails otherwise)
clr %o2
/* %g1 = 1 << 27, i.e. 1 << (WORDSIZE - N - 1) with WORDSIZE = 32 and
   N = 4 from the file header; dividends below it take the
   not_really_big path. */
sethi %hi(1 << (32 - 4 - 1)), %g1
cmp %o3, %g1
blu LOC(not_really_big)
clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
! The number of bits in the result here is N*ITER+SC, where SC <= N.
! Compute ITER in an unorthodox manner: know we need to shift V into
! the top decade: so do not even bother to compare to R.
1:
cmp %o5, %g1
bgeu 3f
mov 1, %g2
sll %o5, 4, %o5
b 1b
add %o4, 1, %o4
! Now compute %g2.
2: addcc %o5, %o5, %o5
bcc LOC(not_too_big)
add %g2, 1, %g2
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
! Restore %o5 and subtract from %o3.
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
b LOC(do_single_div)
sub %g2, 1, %g2
LOC(not_too_big):
3: cmp %o5, %o3
blu 2b
nop
be LOC(do_single_div)
nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
! srl %o5, 1, %o5
! dec %g2
! do single-bit divide steps
!
! We have to be careful here. We know that %o3 >= %o5, so we can do the
! first divide step without thinking. BUT, the others are conditional,
! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
! order bit set in the first step, just falling into the regular
! division loop will mess up the first time around.
! So we unroll slightly...
LOC(do_single_div):
subcc %g2, 1, %g2
bl LOC(end_regular_divide)
nop
sub %o3, %o5, %o3
mov 1, %o2
b LOC(end_single_divloop)
nop
/* One quotient bit per pass: shift %o2 left, halve the comparand, then
   subtract or add it depending on the sign of the remainder (the
   condition codes come from the "tst %o3" at the loop bottom). */
LOC(single_divloop):
sll %o2, 1, %o2
bl 1f
srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
2:
LOC(end_single_divloop):
subcc %g2, 1, %g2
bge LOC(single_divloop)
tst %o3
b,a LOC(end_regular_divide)
/* Small-dividend path: scale the comparand %o5 up 4 bits at a time while
   it is still <= the remainder %o3, counting the shifts in %o4. */
LOC(not_really_big):
1:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
addcc %o4, 1, %o4
be LOC(got_result)
sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
/* Main loop: each pass shifts the quotient %o2 left by 4 and walks a
   4-level decision tree.  Every level halves %o5 and subtracts it from
   (remainder positive) or adds it to (remainder negative) %o3; each leaf
   adds its signed digit to %o2 in the delay slot of "b 9f". */
LOC(divloop):
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl LOC(1.16)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl LOC(2.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl LOC(3.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl LOC(4.23)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (7*2+1), %o2
LOC(4.23):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (7*2-1), %o2
LOC(3.19):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl LOC(4.21)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (5*2+1), %o2
LOC(4.21):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (5*2-1), %o2
LOC(2.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl LOC(3.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl LOC(4.19)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (3*2+1), %o2
LOC(4.19):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (3*2-1), %o2
LOC(3.17):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl LOC(4.17)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (1*2+1), %o2
LOC(4.17):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (1*2-1), %o2
LOC(1.16):
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl LOC(2.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl LOC(3.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl LOC(4.15)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-1*2+1), %o2
LOC(4.15):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-1*2-1), %o2
LOC(3.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl LOC(4.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-3*2+1), %o2
LOC(4.13):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-3*2-1), %o2
LOC(2.15):
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl LOC(3.13)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl LOC(4.11)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-5*2+1), %o2
LOC(4.11):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-5*2-1), %o2
LOC(3.13):
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl LOC(4.9)
srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
b 9f
add %o2, (-7*2+1), %o2
LOC(4.9):
! remainder is negative
addcc %o3,%o5,%o3
b 9f
add %o2, (-7*2-1), %o2
9:
LOC(end_regular_divide):
/* Loop while iterations remain in %o4; "tst %o3" in the delay slot sets
   the condition codes both for the next pass and for the final sign
   check below. */
subcc %o4, 1, %o4
bge LOC(divloop)
tst %o3
/* If the final remainder is negative, add the divisor back once.  The
   branch is annulling (",a"), so the add runs only when it is taken. */
bl,a LOC(got_result)
! non-restoring fixup here (one instruction only!)
add %o3, %o1, %o3
LOC(got_result):
/* NOTE(review): the lines from here to END look like two versions
   interleaved.  wr/nop/udiv/umul/sub computes %o0 - (%o0 / %o1) * %o1
   with the hardware divide (the nops presumably cover the delay between
   writing %y and udiv reading it), while "retl" followed by
   "mov %o3, %o0" returns the software remainder.  As rendered, the mov
   occupies retl's delay slot and the trailing sub is never reached --
   confirm which sequence the real file contains. */
wr %g0, 0, %y
nop
nop
nop
udiv %o0, %o1, %o2
umul %o2, %o1, %o2
retl
mov %o3, %o0
sub %o0, %o2, %o0
END(.urem)