* sysdeps/alpha/Makefile <gnulib> (sysdep_routines): Merge divrem variable, add unsigned variants. * sysdeps/alpha/divrem.h: Remove file. * sysdeps/alpha/div_libc.h: New file. * sysdeps/alpha/divl.S: Rewrite from scratch. * sysdeps/alpha/reml.S: Likewise. * sysdeps/alpha/divq.S: Likewise. * sysdeps/alpha/remq.S: Likewise. * sysdeps/alpha/divlu.S: New file. * sysdeps/alpha/remlu.S: New file. * sysdeps/alpha/divqu.S: New file. * sysdeps/alpha/remqu.S: New file.

2004-03-26 Richard Henderson <rth@redhat.com> * sysdeps/alpha/Makefile <gnulib> (sysdep_routines): Merge divrem variable, add unsigned variants. * sysdeps/alpha/divrem.h: Remove file. * sysdeps/alpha/div_libc.h: New file. * sysdeps/alpha/divl.S: Rewrite from scratch. * sysdeps/alpha/reml.S: Likewise. * sysdeps/alpha/divq.S: Likewise. * sysdeps/alpha/remq.S: Likewise. * sysdeps/alpha/divlu.S: New file. * sysdeps/alpha/remlu.S: New file. * sysdeps/alpha/divqu.S: New file. * sysdeps/alpha/remqu.S: New file.
2024-11-10 07:10:06 +00:00 · 2004-03-27 00:32:28 +00:00 · 2004-03-27 00:32:28 +00:00 · fdc715a1f0
commit fdc715a1f0
parent 4ba541a978
12 changed files with 1309 additions and 248 deletions
--- a/15
+++ b/15
@ -1,3 +1,18 @@
+2004-03-26  Richard Henderson  <rth@redhat.com>
+
+	* sysdeps/alpha/Makefile <gnulib> (sysdep_routines): Merge divrem
+	variable, add unsigned variants.
+	* sysdeps/alpha/divrem.h: Remove file.
+	* sysdeps/alpha/div_libc.h: New file.
+	* sysdeps/alpha/divl.S: Rewrite from scratch.
+	* sysdeps/alpha/reml.S: Likewise.
+	* sysdeps/alpha/divq.S: Likewise.
+	* sysdeps/alpha/remq.S: Likewise.
+	* sysdeps/alpha/divlu.S: New file.
+	* sysdeps/alpha/remlu.S: New file.
+	* sysdeps/alpha/divqu.S: New file.
+	* sysdeps/alpha/remqu.S: New file.
+
 2004-03-26  Ulrich Drepper  <drepper@redhat.com>

 	* elf/dl-open.c (check_libc_caller): Fix typo.
--- a/sysdeps/alpha/Makefile
+++ b/sysdeps/alpha/Makefile
@ -26,7 +26,7 @@ sysdep_routines += _mcount
 endif

 ifeq ($(subdir),gnulib)
-sysdep_routines += $(divrem)
+sysdep_routines += divl divlu divq divqu reml remlu remq remqu
 endif

 ifeq ($(subdir),string)
@ -38,8 +38,6 @@ ifeq ($(subdir),elf)
 CFLAGS-rtld.c = -mbuild-constants
 endif

-divrem := divl divq reml remq
-
 # For now, build everything with full IEEE math support.
 # TODO: build separate libm and libm-ieee.
 sysdep-CFLAGS += -mieee
--- a/sysdeps/alpha/div_libc.h
+++ b/sysdeps/alpha/div_libc.h
@ -0,0 +1,113 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* Common bits for implementing software divide.  */
+
+#include <sysdep.h>
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/pal.h>
+#else
+# include <machine/pal.h>
+#endif
+
+/* These are not normal C functions.  Argument registers are t10 and t11;
+   the result goes in t12; the return address is in t9.  Only t12 and AT
+   may be clobbered.  */
+#define X	t10
+#define Y	t11
+#define RV	t12
+#define RA	t9
+
+/* None of these functions should use implicit anything.  */
+	.set	nomacro
+	.set	noat
+
+/* Code fragment to invoke _mcount for profiling.  This should be invoked
+   directly after allocation of the stack frame.  */
+.macro CALL_MCOUNT
+#ifdef PROF
+	stq	ra, 0(sp)
+	stq	pv, 8(sp)
+	stq	gp, 16(sp)
+	cfi_rel_offset (ra, 0)
+	cfi_rel_offset (pv, 8)
+	cfi_rel_offset (gp, 16)
+	br	AT, 1f
+	.set	macro
+1:	ldgp	gp, 0(AT)
+	mov	RA, ra
+	lda	AT, _mcount
+	jsr	AT, (AT), _mcount
+	.set	nomacro
+	ldq	ra, 0(sp)
+	ldq	pv, 8(sp)
+	ldq	gp, 16(sp)
+	cfi_restore (ra)
+	cfi_restore (pv)
+	cfi_restore (gp)
+	/* Realign subsequent code with what we'd have without this
+	   macro at all.  This means aligned with one arithmetic insn
+	   used within the bundle.  */
+	.align	4
+	nop
+#endif
+.endm
+
+/* In order to make the below work, all top-level divide routines must
+   use the same frame size.  */
+#define FRAME	48
+
+/* Code fragment to generate an integer divide-by-zero fault.  When
+   building libc.so, we arrange for there to be one copy of this code
+   placed late in the dso, such that all branches are forward.  When
+   building libc.a, we use multiple copies to avoid having an out of
+   range branch.  Users should jump to DIVBYZERO.  */
+
+.macro DO_DIVBYZERO
+#ifdef PIC
+#define DIVBYZERO	__divbyzero
+	.section .gnu.linkonce.t.divbyzero, "ax", @progbits
+	.globl	__divbyzero
+	.type	__divbyzero, @function
+	.usepv	__divbyzero, no
+	.hidden	__divbyzero
+#else
+#define DIVBYZERO	$divbyzero
+#endif
+
+	.align	4
+DIVBYZERO:
+	cfi_startproc
+	cfi_return_column (RA)
+	cfi_def_cfa_offset (FRAME)
+
+	mov	a0, RV
+	unop
+	lda	a0, GEN_INTDIV
+	call_pal PAL_gentrap
+
+	mov	RV, a0
+	clr	RV
+	lda	sp, FRAME(sp)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	cfi_endproc
+	.size	DIVBYZERO, .-DIVBYZERO
+.endm
--- a/sysdeps/alpha/divl.S
+++ b/sysdeps/alpha/divl.S
@ -1,6 +1,75 @@
-#define IS_REM		0
-#define SIZE		4
-#define UFUNC_NAME	__divlu
-#define SFUNC_NAME	__divl
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.

-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+/* 32-bit signed int divide.  This is not a normal C function.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   The FPU can handle all input values except zero.  Whee!  */
+
+#ifndef EXTEND
+#define EXTEND(S,D)	sextl S, D
+#endif
+
+	.text
+	.align	4
+	.globl	__divl
+	.type	__divl, @function
+	.usepv	__divl, no
+
+	cfi_startproc
+	cfi_return_column (RA)
+__divl:
+	lda	sp, -FRAME(sp)
+	cfi_def_cfa_offset (FRAME)
+	CALL_MCOUNT
+	stt	$f0, 0(sp)
+	stt	$f1, 8(sp)
+	beq	Y, DIVBYZERO
+	cfi_rel_offset ($f0, 0)
+	cfi_rel_offset ($f1, 8)
+
+	EXTEND	(X, RV)
+	EXTEND	(Y, AT)
+	stq	RV, 16(sp)
+	stq	AT, 24(sp)
+
+	ldt	$f0, 16(sp)
+	ldt	$f1, 24(sp)
+	cvtqt	$f0, $f0
+	cvtqt	$f1, $f1
+
+	divt/c	$f0, $f1, $f0
+	cvttq/c	$f0, $f0
+	stt	$f0, 16(sp)
+	ldt	$f0, 0(sp)
+
+	ldt	$f1, 8(sp)
+	ldl	RV, 16(sp)
+	lda	sp, FRAME(sp)
+	cfi_restore ($f0)
+	cfi_restore ($f1)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	cfi_endproc
+	.size	__divl, .-__divl
+
+	DO_DIVBYZERO
--- a/sysdeps/alpha/divlu.S
+++ b/sysdeps/alpha/divlu.S
@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D)	zapnot S, 15, D
+#define __divl		__divlu
+#include <divl.S>
--- a/sysdeps/alpha/divq.S
+++ b/sysdeps/alpha/divq.S
@ -1,6 +1,265 @@
-#define IS_REM		0
-#define SIZE		8
-#define UFUNC_NAME	__divqu
-#define SFUNC_NAME	__divq
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.

-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long divide.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotent as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+	.text
+	.align	4
+	.globl	__divq
+	.type	__divq, @function
+	.usepv	__divq, no
+
+	cfi_startproc
+	cfi_return_column (RA)
+__divq:
+	lda	sp, -FRAME(sp)
+	cfi_def_cfa_offset (FRAME)
+	CALL_MCOUNT
+
+	/* Get the fp divide insn issued as quickly as possible.  After
+	   that's done, we have at least 22 cycles until its results are
+	   ready -- all the time in the world to figure out how we're
+	   going to use the results.  */
+	stq	X, 16(sp)
+	stq	Y, 24(sp)
+	beq	Y, DIVBYZERO
+
+	stt	$f0, 0(sp)
+	stt	$f1, 8(sp)
+	cfi_rel_offset ($f0, 0)
+	cfi_rel_offset ($f1, 8)
+	ldt	$f0, 16(sp)
+	ldt	$f1, 24(sp)
+
+	cvtqt	$f0, $f0
+	cvtqt	$f1, $f1
+	divt/c	$f0, $f1, $f0
+
+	/* Check to see if X fit in the double as an exact value.  */
+	sll	X, (64-53), AT
+	ldt	$f1, 8(sp)
+	sra	AT, (64-53), AT
+	cmpeq	X, AT, AT
+	beq	AT, $x_big
+
+	/* If we get here, we're expecting exact results from the division.
+	   Do nothing else besides convert and clean up.  */
+	cvttq/c	$f0, $f0
+	stt	$f0, 16(sp)
+
+	ldq	RV, 16(sp)
+	ldt	$f0, 0(sp)
+	cfi_restore ($f1)
+	cfi_remember_state
+	cfi_restore ($f0)
+	cfi_def_cfa_offset (0)
+	lda	sp, FRAME(sp)
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+$x_big:
+	/* If we get here, X is large enough that we don't expect exact
+	   results, and neither X nor Y got mis-translated for the fp
+	   division.  Our task is to take the fp result, figure out how
+	   far it's off from the correct result and compute a fixup.  */
+	stq	t0, 16(sp)
+	stq	t1, 24(sp)
+	stq	t2, 32(sp)
+	stq	t5, 40(sp)
+	cfi_rel_offset (t0, 16)
+	cfi_rel_offset (t1, 24)
+	cfi_rel_offset (t2, 32)
+	cfi_rel_offset (t5, 40)
+
+#define Q	RV		/* quotient */
+#define R	t0		/* remainder */
+#define SY	t1		/* scaled Y */
+#define S	t2		/* scalar */
+#define QY	t3		/* Q*Y */
+
+	/* The fixup code below can only handle unsigned values.  */
+	or	X, Y, AT
+	mov	$31, t5
+	blt	AT, $fix_sign_in
+$fix_sign_in_ret1:
+	cvttq/c	$f0, $f0
+
+	stt	$f0, 8(sp)
+	ldq	Q, 8(sp)
+$fix_sign_in_ret2:
+	mulq	Q, Y, QY
+	stq	t4, 8(sp)
+
+	ldt	$f0, 0(sp)
+	unop
+	cfi_rel_offset (t4, 8)
+	cfi_restore ($f0)
+	stq	t3, 0(sp)
+	unop
+	cfi_rel_offset (t3, 0)
+
+	subq	QY, X, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_high
+
+$q_high_ret:
+	subq	X, QY, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_low
+
+$q_low_ret:
+	ldq	t0, 16(sp)
+	ldq	t1, 24(sp)
+	ldq	t2, 32(sp)
+	bne	t5, $fix_sign_out
+
+$fix_sign_out_ret:
+	ldq	t3, 0(sp)
+	ldq	t4, 8(sp)
+	ldq	t5, 40(sp)
+	lda	sp, FRAME(sp)
+	cfi_remember_state
+	cfi_restore (t0)
+	cfi_restore (t1)
+	cfi_restore (t2)
+	cfi_restore (t3)
+	cfi_restore (t4)
+	cfi_restore (t5)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+	/* The quotient that we computed was too large.  We need to reduce
+	   it by S such that Y*S >= R.  Obviously the closer we get to the
+	   correct value the better, but overshooting high is ok, as we'll
+	   fix that up later.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_high:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	subq	Q, S, Q
+	unop
+	subq	QY, SY, QY
+	br	$q_high_ret
+
+	.align	4
+	/* The quotient that we computed was too small.  Divide Y by the 
+	   current remainder (R) and add that to the existing quotient (Q).
+	   The expectation, of course, is that R is much smaller than X.  */
+	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+	   already have a copy of Y in SY and the value 1 in S.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_low:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	/* Shift-down and subtract loop.  Each iteration compares our scaled
+	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
+	   Y's scalar (S) so add it to the quotient (Q).  */
+2:	addq	Q, S, t3
+	srl	S, 1, S
+	cmpule	SY, R, AT
+	subq	R, SY, t4
+
+	cmovne	AT, t3, Q
+	cmovne	AT, t4, R
+	srl	SY, 1, SY
+	bne	S, 2b
+
+	br	$q_low_ret
+
+	.align	4
+$fix_sign_in:
+	/* If we got here, then X|Y is negative.  Need to adjust everything
+	   such that we're doing unsigned division in the fixup loop.  */
+	/* T5 records the changes we had to make:
+		bit 0:	set if result should be negative.
+		bit 2:	set if X was negated.
+		bit 3:	set if Y was negated.
+	*/
+	xor	X, Y, AT
+	cmplt	AT, 0, t5
+	cmplt	X, 0, AT
+	negq	X, t0
+
+	s4addq	AT, t5, t5
+	cmovne	AT, t0, X
+	cmplt	Y, 0, AT
+	negq	Y, t0
+
+	s8addq	AT, t5, t5
+	cmovne	AT, t0, Y
+	unop
+	blbc	t5, $fix_sign_in_ret1
+
+	cvttq/c	$f0, $f0
+	stt	$f0, 8(sp)
+	ldq	Q, 8(sp)
+	unop
+
+	negq	Q, Q
+	br	$fix_sign_in_ret2
+
+	.align	4
+$fix_sign_out:
+	/* Now we get to undo what we did above.  */
+	/* ??? Is this really faster than just increasing the size of
+	   the stack frame and storing X and Y in memory?  */
+	and	t5, 8, AT
+	negq	Y, t4
+	cmovne	AT, t4, Y
+
+	and	t5, 4, AT
+	negq	X, t4
+	cmovne	AT, t4, X
+
+	negq	RV, t4
+	cmovlbs	t5, t4, RV
+
+	br	$fix_sign_out_ret
+
+	cfi_endproc
+	.size	__divq, .-__divq
+
+	DO_DIVBYZERO
--- a/sysdeps/alpha/divqu.S
+++ b/sysdeps/alpha/divqu.S
@ -0,0 +1,244 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long divide.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
+   clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotent as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+	.text
+	.align	4
+	.globl	__divqu
+	.type	__divqu, @function
+	.usepv	__divqu, no
+
+	cfi_startproc
+	cfi_return_column (RA)
+__divqu:
+	lda	sp, -FRAME(sp)
+	cfi_def_cfa_offset (FRAME)
+	CALL_MCOUNT
+
+	/* Get the fp divide insn issued as quickly as possible.  After
+	   that's done, we have at least 22 cycles until its results are
+	   ready -- all the time in the world to figure out how we're
+	   going to use the results.  */
+	stq	X, 16(sp)
+	stq	Y, 24(sp)
+	beq	Y, DIVBYZERO
+
+	stt	$f0, 0(sp)
+	stt	$f1, 8(sp)
+	cfi_rel_offset ($f0, 0)
+	cfi_rel_offset ($f1, 8)
+	ldt	$f0, 16(sp)
+	ldt	$f1, 24(sp)
+
+	cvtqt	$f0, $f0
+	cvtqt	$f1, $f1
+	blt	X, $x_is_neg
+	divt/c	$f0, $f1, $f0
+
+	/* Check to see if Y was mis-converted as signed value.  */
+	ldt	$f1, 8(sp)
+	unop
+	nop
+	blt	Y, $y_is_neg
+
+	/* Check to see if X fit in the double as an exact value.  */
+	srl	X, 53, AT
+	bne	AT, $x_big
+
+	/* If we get here, we're expecting exact results from the division.
+	   Do nothing else besides convert and clean up.  */
+	cvttq/c	$f0, $f0
+	stt	$f0, 16(sp)
+
+	ldq	RV, 16(sp)
+	ldt	$f0, 0(sp)
+	cfi_remember_state
+	cfi_restore ($f0)
+	cfi_restore ($f1)
+	cfi_def_cfa_offset (0)
+	lda	sp, FRAME(sp)
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+$x_is_neg:
+	/* If we get here, X is so big that bit 63 is set, which made the
+	   conversion come out negative.  Fix it up lest we not even get
+	   a good estimate.  */
+	ldah	AT, 0x5f80		/* 2**64 as float.  */
+	stt	$f2, 24(sp)
+	cfi_rel_offset ($f2, 24)
+	stl	AT, 16(sp)
+	lds	$f2, 16(sp)
+
+	addt	$f0, $f2, $f0
+	unop
+	divt/c	$f0, $f1, $f0
+	unop
+
+	/* Ok, we've now the divide issued.  Continue with other checks.  */
+	ldt	$f1, 8(sp)
+	unop
+	ldt	$f2, 24(sp)
+	blt	Y, $y_is_neg
+	cfi_restore ($f1)
+	cfi_restore ($f2)
+	cfi_remember_state	/* for y_is_neg */
+
+	.align	4
+$x_big:
+	/* If we get here, X is large enough that we don't expect exact
+	   results, and neither X nor Y got mis-translated for the fp
+	   division.  Our task is to take the fp result, figure out how
+	   far it's off from the correct result and compute a fixup.  */
+	stq	t0, 16(sp)
+	stq	t1, 24(sp)
+	stq	t2, 32(sp)
+	stq	t3, 40(sp)
+	cfi_rel_offset (t0, 16)
+	cfi_rel_offset (t1, 24)
+	cfi_rel_offset (t2, 32)
+	cfi_rel_offset (t3, 40)
+
+#define Q	RV		/* quotient */
+#define R	t0		/* remainder */
+#define SY	t1		/* scaled Y */
+#define S	t2		/* scalar */
+#define QY	t3		/* Q*Y */
+
+	cvttq/c	$f0, $f0
+	stt	$f0, 8(sp)
+	ldq	Q, 8(sp)
+	mulq	Q, Y, QY
+
+	stq	t4, 8(sp)
+	unop
+	ldt	$f0, 0(sp)
+	unop
+	cfi_rel_offset (t4, 8)
+	cfi_restore ($f0)
+
+	subq	QY, X, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_high
+
+$q_high_ret:
+	subq	X, QY, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_low
+
+$q_low_ret:
+	ldq	t4, 8(sp)
+	ldq	t0, 16(sp)
+	ldq	t1, 24(sp)
+	ldq	t2, 32(sp)
+
+	ldq	t3, 40(sp)
+	lda	sp, FRAME(sp)
+	cfi_remember_state
+	cfi_restore (t0)
+	cfi_restore (t1)
+	cfi_restore (t2)
+	cfi_restore (t3)
+	cfi_restore (t4)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+	/* The quotient that we computed was too large.  We need to reduce
+	   it by S such that Y*S >= R.  Obviously the closer we get to the
+	   correct value the better, but overshooting high is ok, as we'll
+	   fix that up later.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_high:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	subq	Q, S, Q
+	unop
+	subq	QY, SY, QY
+	br	$q_high_ret
+
+	.align	4
+	/* The quotient that we computed was too small.  Divide Y by the 
+	   current remainder (R) and add that to the existing quotient (Q).
+	   The expectation, of course, is that R is much smaller than X.  */
+	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+	   already have a copy of Y in SY and the value 1 in S.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_low:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	/* Shift-down and subtract loop.  Each iteration compares our scaled
+	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
+	   Y's scalar (S) so add it to the quotient (Q).  */
+2:	addq	Q, S, t3
+	srl	S, 1, S
+	cmpule	SY, R, AT
+	subq	R, SY, t4
+
+	cmovne	AT, t3, Q
+	cmovne	AT, t4, R
+	srl	SY, 1, SY
+	bne	S, 2b
+
+	br	$q_low_ret
+
+	.align	4
+	cfi_restore_state
+$y_is_neg:
+	/* If we get here, Y is so big that bit 63 is set.  The results
+	   from the divide will be completely wrong.  Fortunately, the
+	   quotient must be either 0 or 1, so just compute it directly.  */
+	cmpult	Y, X, RV
+	ldt	$f0, 0(sp)
+	lda	sp, FRAME(sp)
+	cfi_restore ($f0)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	cfi_endproc
+	.size	__divqu, .-__divqu
+
+	DO_DIVBYZERO
--- a/sysdeps/alpha/divrem.h
+++ b/sysdeps/alpha/divrem.h
@ -1,225 +0,0 @@
-/* Copyright (C) 1996,97,2002 Free Software Foundation, Inc.
-   Contributed by David Mosberger (davidm@cs.arizona.edu).
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* The current Alpha chips don't provide hardware for integer
-   division.  The C compiler expects the functions
-
-	__divqu: 64-bit unsigned long divide
-	__remqu: 64-bit unsigned long remainder
-	__divqs/__remqs: signed 64-bit
-	__divlu/__remlu: unsigned 32-bit
-	__divls/__remls: signed 32-bit
-
-   These are not normal C functions: instead of the normal calling
-   sequence, these expect their arguments in registers t10 and t11, and
-   return the result in t12 (aka pv).  Register AT may be clobbered
-   (assembly temporary), anything else must be saved.  */
-
-#include <sysdep.h>
-
-#ifdef __linux__
-# include <asm/gentrap.h>
-# include <asm/pal.h>
-#else
-# include <machine/pal.h>
-#endif
-
-#define mask			v0
-#define divisor			t0
-#define compare			AT
-#define tmp1			t2
-#define tmp2			t3
-#define retaddr			t9
-#define arg1			t10
-#define arg2			t11
-#define result			t12
-
-#if IS_REM
-# define DIV_ONLY(x,y...)
-# define REM_ONLY(x,y...)	x,##y
-# define modulus		result
-# define quotient		t1
-# define GETSIGN(x)		mov arg1, x
-# define STACK			32
-#else
-# define DIV_ONLY(x,y...)	x,##y
-# define REM_ONLY(x,y...)
-# define modulus		t1
-# define quotient		result
-# define GETSIGN(x)		xor arg1, arg2, x
-# define STACK			48
-#endif
-
-#if SIZE == 8
-# define LONGIFY(x,y)		mov x,y
-# define SLONGIFY(x,y)		mov x,y
-# define _SLONGIFY(x)
-# define NEG(x,y)		negq x,y
-#else
-# define LONGIFY(x,y)		zapnot x,15,y
-# define SLONGIFY(x,y)		sextl x,y
-# define _SLONGIFY(x)		sextl x,x
-# define NEG(x,y)		negl x,y
-#endif
-
-	.set noreorder
-	.set noat
-
-	.ent UFUNC_NAME
-	.globl UFUNC_NAME
-
-	.align 3
-UFUNC_NAME:
-$udiv_entry:
-	lda	sp, -STACK(sp)
-	.frame	sp, STACK, retaddr, 0
-#ifdef PROF
-	stq	ra, 0(sp)
-	stq	pv, 8(sp)
-	stq	gp, 16(sp)
-
-	br	AT, 1f
-1:	ldgp	gp, 0(AT)
-
-	mov	retaddr, ra
-	lda	AT, _mcount
-	jsr	AT, (AT), _mcount
-
-	ldq	ra, 0(sp)
-	ldq	pv, 8(sp)
-	ldq	gp, 16(sp)
-#endif
-	.prologue 0
-
-$udiv:
-	stq	t0, 0(sp)
-	LONGIFY	(arg2, divisor)
-	stq	t1, 8(sp)
-	LONGIFY	(arg1, modulus)
-	stq	v0, 16(sp)
-	clr	quotient
-	stq	tmp1, 24(sp)
-	ldiq	mask, 1
-	DIV_ONLY(stq tmp2,32(sp))
-
-	beq	divisor, $divbyzero
-
-	.align 3
-#if SIZE == 8
-	/* Shift divisor left.  */
-1:	cmpult	divisor, modulus, compare
-	blt	divisor, 2f
-	addq	divisor, divisor, divisor
-	addq	mask, mask, mask
-	bne	compare, 1b
-	unop
-2:
-#else
-	/* Shift divisor left using 3-bit shifts as we can't overflow.
-	   This results in looping three times less here, but up to
-	   two more times later.  Thus using a large shift isn't worth it.  */
-1:	cmpult	divisor, modulus, compare
-	s8addq	divisor, zero, divisor
-	s8addq	mask, zero, mask
-	bne	compare, 1b
-#endif
-
-	/* Now go back to the right.  */
-3:	DIV_ONLY(addq quotient, mask, tmp2)
-	srl	mask, 1, mask
-	cmpule	divisor, modulus, compare
-	subq	modulus, divisor, tmp1
-	DIV_ONLY(cmovne compare, tmp2, quotient)
-	srl	divisor, 1, divisor
-	cmovne	compare, tmp1, modulus
-	bne	mask, 3b
-
-$done:	ldq	t0, 0(sp)
-	ldq	t1, 8(sp)
-	ldq	v0, 16(sp)
-	ldq	tmp1, 24(sp)
-	DIV_ONLY(ldq tmp2, 32(sp))
-	lda	sp, STACK(sp)
-	ret	zero, (retaddr), 1
-
-$divbyzero:
-	mov	a0, tmp1
-	ldiq	a0, GEN_INTDIV
-	call_pal PAL_gentrap
-	mov	tmp1, a0
-	clr	result			/* If trap returns, return zero.  */
-	br	$done
-
-	.end UFUNC_NAME
-
-	.ent SFUNC_NAME
-	.globl SFUNC_NAME
-
-	.align 3
-SFUNC_NAME:
-	lda	sp, -STACK(sp)
-	.frame	sp, STACK, retaddr, 0
-#ifdef PROF
-	stq	ra, 0(sp)
-	stq	pv, 8(sp)
-	stq	gp, 16(sp)
-
-	br	AT, 1f
-1:	ldgp	gp, 0(AT)
-
-	mov	retaddr, ra
-	jsr	AT, _mcount
-
-	ldq	ra, 0(sp)
-	ldq	pv, 8(sp)
-	ldq	gp, 16(sp)
-#endif
-	.prologue 0
-
-	or	arg1, arg2, AT
-	_SLONGIFY(AT)
-	bge	AT, $udiv		/* don't need to mess with signs */
-
-	/* Save originals and find absolute values.  */
-	stq	arg1, 0(sp)
-	NEG	(arg1, AT)
-	stq	arg2, 8(sp)
-	cmovge	AT, AT, arg1
-	stq	retaddr, 16(sp)
-	NEG	(arg2, AT)
-	stq	tmp1, 24(sp)
-	cmovge	AT, AT, arg2
-
-	/* Do the unsigned division.  */
-	bsr	retaddr, $udiv_entry
-
-	/* Restore originals and adjust the sign of the result.  */
-	ldq	arg1, 0(sp)
-	ldq	arg2, 8(sp)
-	GETSIGN	(AT)
-	NEG	(result, tmp1)
-	_SLONGIFY(AT)
-	ldq	retaddr, 16(sp)
-	cmovlt	AT, tmp1, result
-	ldq	tmp1, 24(sp)
-
-	lda	sp, STACK(sp)
-	ret	zero, (retaddr), 1
-
-	.end	SFUNC_NAME
--- a/sysdeps/alpha/reml.S
+++ b/sysdeps/alpha/reml.S
@ -1,6 +1,80 @@
-#define IS_REM		1
-#define SIZE		4
-#define UFUNC_NAME	__remlu
-#define SFUNC_NAME	__reml
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   Contributed by Richard Henderson  <rth@twiddle.net>
+   This file is part of the GNU C Library.

-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+/* 32-bit signed int remainder.  This is not a normal C function.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   The FPU can handle the division for all input values except zero.
+   All we have to do is compute the remainder via multiply-and-subtract.  */
+
+#ifndef EXTEND
+#define EXTEND(S,D)	sextl S, D
+#endif
+
+	.text
+	.align	4
+	.globl	__reml
+	.type	__reml, @function
+	.usepv	__reml, no
+
+	cfi_startproc
+	cfi_return_column (RA)
+__reml:
+	lda	sp, -FRAME(sp)
+	cfi_def_cfa_offset (FRAME)
+	CALL_MCOUNT
+	stt	$f0, 0(sp)
+	stt	$f1, 8(sp)
+	beq	Y, DIVBYZERO
+	cfi_rel_offset ($f0, 0)
+	cfi_rel_offset ($f1, 8)
+
+	EXTEND	(X, RV)
+	EXTEND	(Y, AT)
+	stq	RV, 16(sp)
+	stq	AT, 24(sp)
+
+	ldt	$f0, 16(sp)
+	ldt	$f1, 24(sp)
+	cvtqt	$f0, $f0
+	cvtqt	$f1, $f1
+
+	divt/c	$f0, $f1, $f0
+	cvttq/c	$f0, $f0
+	stt	$f0, 16(sp)
+	ldq	RV, 16(sp)
+
+	ldt	$f0, 0(sp)
+	mull	RV, Y, RV
+	ldt	$f1, 8(sp)
+	lda	sp, FRAME(sp)
+	cfi_restore ($f0)
+	cfi_restore ($f1)
+	cfi_def_cfa_offset (0)
+
+	subl	X, RV, RV
+	ret	$31, (RA), 1
+
+	cfi_endproc
+	.size	__reml, .-__reml
+
+	DO_DIVBYZERO
--- a/sysdeps/alpha/remlu.S
+++ b/sysdeps/alpha/remlu.S
@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D)	zapnot S, 15, D
+#define __reml		__remlu
+#include <reml.S>
--- a/sysdeps/alpha/remq.S
+++ b/sysdeps/alpha/remq.S
@ -1,6 +1,261 @@
-#define IS_REM		1
-#define SIZE		8
-#define UFUNC_NAME	__remqu
-#define SFUNC_NAME	__remq
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.

-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long remainder.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
+   be clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotent as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+	.text
+	.align	4
+	.globl	__remq
+	.type	__remq, @function
+	.usepv	__remq, no
+
+	cfi_startproc
+	cfi_return_column (RA)
+__remq:
+	lda	sp, -FRAME(sp)
+	cfi_def_cfa_offset (FRAME)
+	CALL_MCOUNT
+
+	/* Get the fp divide insn issued as quickly as possible.  After
+	   that's done, we have at least 22 cycles until its results are
+	   ready -- all the time in the world to figure out how we're
+	   going to use the results.  */
+	stq	X, 16(sp)
+	stq	Y, 24(sp)
+	beq	Y, DIVBYZERO
+
+	stt	$f0, 0(sp)
+	stt	$f1, 8(sp)
+	cfi_rel_offset ($f0, 0)
+	cfi_rel_offset ($f1, 8)
+	ldt	$f0, 16(sp)
+	ldt	$f1, 24(sp)
+
+	cvtqt	$f0, $f0
+	cvtqt	$f1, $f1
+	divt/c	$f0, $f1, $f0
+
+	/* Check to see if X fit in the double as an exact value.  */
+	sll	X, (64-53), AT
+	ldt	$f1, 8(sp)
+	sra	AT, (64-53), AT
+	cmpeq	X, AT, AT
+	beq	AT, $x_big
+
+	/* If we get here, we're expecting exact results from the division.
+	   Do nothing else besides convert, compute remainder, clean up.  */
+	cvttq/c	$f0, $f0
+	stt	$f0, 16(sp)
+
+	ldq	AT, 16(sp)
+	mulq	AT, Y, AT
+	ldt	$f0, 0(sp)
+	cfi_restore ($f1)
+	cfi_remember_state
+	cfi_restore ($f0)
+	cfi_def_cfa_offset (0)
+	lda	sp, FRAME(sp)
+
+	subq	X, AT, RV
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+$x_big:
+	/* If we get here, X is large enough that we don't expect exact
+	   results, and neither X nor Y got mis-translated for the fp
+	   division.  Our task is to take the fp result, figure out how
+	   far it's off from the correct result and compute a fixup.  */
+	stq	t0, 16(sp)
+	stq	t1, 24(sp)
+	stq	t2, 32(sp)
+	stq	t5, 40(sp)
+	cfi_rel_offset (t0, 16)
+	cfi_rel_offset (t1, 24)
+	cfi_rel_offset (t2, 32)
+	cfi_rel_offset (t5, 40)
+
+#define Q	t0		/* quotient */
+#define R	RV		/* remainder */
+#define SY	t1		/* scaled Y */
+#define S	t2		/* scalar */
+#define QY	t3		/* Q*Y */
+
+	/* The fixup code below can only handle unsigned values.  */
+	or	X, Y, AT
+	mov	$31, t5
+	blt	AT, $fix_sign_in
+$fix_sign_in_ret1:
+	cvttq/c	$f0, $f0
+
+	stt	$f0, 8(sp)
+	ldq	Q, 8(sp)
+$fix_sign_in_ret2:
+	mulq	Q, Y, QY
+	stq	t4, 8(sp)
+
+	ldt	$f0, 0(sp)
+	unop
+	cfi_rel_offset (t4, 8)
+	cfi_restore ($f0)
+	stq	t3, 0(sp)
+	unop
+	cfi_rel_offset (t3, 0)
+
+	subq	QY, X, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_high
+
+$q_high_ret:
+	subq	X, QY, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_low
+
+$q_low_ret:
+	ldq	t0, 16(sp)
+	ldq	t1, 24(sp)
+	ldq	t2, 32(sp)
+	bne	t5, $fix_sign_out
+
+$fix_sign_out_ret:
+	ldq	t3, 0(sp)
+	ldq	t4, 8(sp)
+	ldq	t5, 40(sp)
+	lda	sp, FRAME(sp)
+	cfi_remember_state
+	cfi_restore (t0)
+	cfi_restore (t1)
+	cfi_restore (t2)
+	cfi_restore (t3)
+	cfi_restore (t4)
+	cfi_restore (t5)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+	/* The quotient that we computed was too large.  We need to reduce
+	   it by S such that Y*S >= R.  Obviously the closer we get to the
+	   correct value the better, but overshooting high is ok, as we'll
+	   fix that up later.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_high:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	subq	Q, S, Q
+	unop
+	subq	QY, SY, QY
+	br	$q_high_ret
+
+	.align	4
+	/* The quotient that we computed was too small.  Divide Y by the 
+	   current remainder (R) and add that to the existing quotient (Q).
+	   The expectation, of course, is that R is much smaller than X.  */
+	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+	   already have a copy of Y in SY and the value 1 in S.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_low:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	/* Shift-down and subtract loop.  Each iteration compares our scaled
+	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
+	   Y's scalar (S) so add it to the quotient (Q).  */
+2:	addq	Q, S, t3
+	srl	S, 1, S
+	cmpule	SY, R, AT
+	subq	R, SY, t4
+
+	cmovne	AT, t3, Q
+	cmovne	AT, t4, R
+	srl	SY, 1, SY
+	bne	S, 2b
+
+	br	$q_low_ret
+
+	.align	4
+$fix_sign_in:
+	/* If we got here, then X|Y is negative.  Need to adjust everything
+	   such that we're doing unsigned division in the fixup loop.  */
+	/* T5 records the changes we had to make:
+		bit 0:	set if X was negated.  Note that the sign of the
+			remainder follows the sign of the divisor.
+		bit 2:	set if Y was negated.
+	*/
+	xor	X, Y, t1
+	cmplt	X, 0, t5
+	negq	X, t0
+	cmovne	t5, t0, X
+
+	cmplt	Y, 0, AT
+	negq	Y, t0
+	s4addq	AT, t5, t5
+	cmovne	AT, t0, Y
+
+	bge	t1, $fix_sign_in_ret1
+	cvttq/c	$f0, $f0
+	stt	$f0, 8(sp)
+	ldq	Q, 8(sp)
+
+	negq	Q, Q
+	br	$fix_sign_in_ret2
+
+	.align	4
+$fix_sign_out:
+	/* Now we get to undo what we did above.  */
+	/* ??? Is this really faster than just increasing the size of
+	   the stack frame and storing X and Y in memory?  */
+	and	t5, 4, AT
+	negq	Y, t4
+	cmovne	AT, t4, Y
+
+	negq	X, t4
+	cmovlbs	t5, t4, X
+	negq	RV, t4
+	cmovlbs	t5, t4, RV
+
+	br	$fix_sign_out_ret
+
+	cfi_endproc
+	.size	__remq, .-__remq
+
+	DO_DIVBYZERO
--- a/sysdeps/alpha/remqu.S
+++ b/sysdeps/alpha/remqu.S
@ -0,0 +1,251 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long remainder.  These are not normal C functions.  Argument
+   registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
+   clobbered.
+
+   Theory of operation here is that we can use the FPU divider for virtually
+   all operands that we see: all dividend values between -2**53 and 2**53-1
+   can be computed directly.  Note that divisor values need not be checked
+   against that range because the rounded fp value will be close enough such
+   that the quotient is < 1, which will properly be truncated to zero when we
+   convert back to integer.
+
+   When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotent as an estimate from which we begin refining
+   an exact integral value.  This reduces the number of iterations in the
+   shift-and-subtract loop significantly.  */
+
+	.text
+	.align	4
+	.globl	__remqu
+	.type	__remqu, @function
+	.usepv	__remqu, no
+
+	cfi_startproc
+	cfi_return_column (RA)
+__remqu:
+	lda	sp, -FRAME(sp)
+	cfi_def_cfa_offset (FRAME)
+	CALL_MCOUNT
+
+	/* Get the fp divide insn issued as quickly as possible.  After
+	   that's done, we have at least 22 cycles until its results are
+	   ready -- all the time in the world to figure out how we're
+	   going to use the results.  */
+	stq	X, 16(sp)
+	stq	Y, 24(sp)
+	beq	Y, DIVBYZERO
+
+	stt	$f0, 0(sp)
+	stt	$f1, 8(sp)
+	cfi_rel_offset ($f0, 0)
+	cfi_rel_offset ($f1, 8)
+	ldt	$f0, 16(sp)
+	ldt	$f1, 24(sp)
+
+	cvtqt	$f0, $f0
+	cvtqt	$f1, $f1
+	blt	X, $x_is_neg
+	divt/c	$f0, $f1, $f0
+
+	/* Check to see if Y was mis-converted as signed value.  */
+	ldt	$f1, 8(sp)
+	unop
+	nop
+	blt	Y, $y_is_neg
+
+	/* Check to see if X fit in the double as an exact value.  */
+	srl	X, 53, AT
+	bne	AT, $x_big
+
+	/* If we get here, we're expecting exact results from the division.
+	   Do nothing else besides convert, compute remainder, clean up.  */
+	cvttq/c	$f0, $f0
+	stt	$f0, 16(sp)
+
+	ldq	AT, 16(sp)
+	mulq	AT, Y, AT
+	ldt	$f0, 0(sp)
+	lda	sp, FRAME(sp)
+	cfi_remember_state
+	cfi_restore ($f0)
+	cfi_restore ($f1)
+	cfi_def_cfa_offset (0)
+
+	subq	X, AT, RV
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+$x_is_neg:
+	/* If we get here, X is so big that bit 63 is set, which made the
+	   conversion come out negative.  Fix it up lest we not even get
+	   a good estimate.  */
+	ldah	AT, 0x5f80		/* 2**64 as float.  */
+	stt	$f2, 24(sp)
+	cfi_rel_offset ($f2, 24)
+	stl	AT, 16(sp)
+	lds	$f2, 16(sp)
+
+	addt	$f0, $f2, $f0
+	unop
+	divt/c	$f0, $f1, $f0
+	unop
+
+	/* Ok, we've now the divide issued.  Continue with other checks.  */
+	ldt	$f1, 8(sp)
+	unop
+	ldt	$f2, 24(sp)
+	blt	Y, $y_is_neg
+	cfi_restore ($f1)
+	cfi_restore ($f2)
+	cfi_remember_state	/* for y_is_neg */
+
+	.align	4
+$x_big:
+	/* If we get here, X is large enough that we don't expect exact
+	   results, and neither X nor Y got mis-translated for the fp
+	   division.  Our task is to take the fp result, figure out how
+	   far it's off from the correct result and compute a fixup.  */
+	stq	t0, 16(sp)
+	stq	t1, 24(sp)
+	stq	t2, 32(sp)
+	stq	t3, 40(sp)
+	cfi_rel_offset (t0, 16)
+	cfi_rel_offset (t1, 24)
+	cfi_rel_offset (t2, 32)
+	cfi_rel_offset (t3, 40)
+
+#define Q	t0		/* quotient */
+#define R	RV		/* remainder */
+#define SY	t1		/* scaled Y */
+#define S	t2		/* scalar */
+#define QY	t3		/* Q*Y */
+
+	cvttq/c	$f0, $f0
+	stt	$f0, 8(sp)
+	ldq	Q, 8(sp)
+	mulq	Q, Y, QY
+
+	stq	t4, 8(sp)
+	unop
+	ldt	$f0, 0(sp)
+	unop
+	cfi_rel_offset (t4, 8)
+	cfi_restore ($f0)
+
+	subq	QY, X, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_high
+
+$q_high_ret:
+	subq	X, QY, R
+	mov	Y, SY
+	mov	1, S
+	bgt	R, $q_low
+
+$q_low_ret:
+	ldq	t4, 8(sp)
+	ldq	t0, 16(sp)
+	ldq	t1, 24(sp)
+	ldq	t2, 32(sp)
+
+	ldq	t3, 40(sp)
+	lda	sp, FRAME(sp)
+	cfi_remember_state
+	cfi_restore (t0)
+	cfi_restore (t1)
+	cfi_restore (t2)
+	cfi_restore (t3)
+	cfi_restore (t4)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	.align	4
+	cfi_restore_state
+	/* The quotient that we computed was too large.  We need to reduce
+	   it by S such that Y*S >= R.  Obviously the closer we get to the
+	   correct value the better, but overshooting high is ok, as we'll
+	   fix that up later.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_high:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	subq	Q, S, Q
+	unop
+	subq	QY, SY, QY
+	br	$q_high_ret
+
+	.align	4
+	/* The quotient that we computed was too small.  Divide Y by the 
+	   current remainder (R) and add that to the existing quotient (Q).
+	   The expectation, of course, is that R is much smaller than X.  */
+	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+	   already have a copy of Y in SY and the value 1 in S.  */
+0:
+	addq	SY, SY, SY
+	addq	S, S, S
+$q_low:
+	cmpult	SY, R, AT
+	bne	AT, 0b
+
+	/* Shift-down and subtract loop.  Each iteration compares our scaled
+	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
+	   Y's scalar (S) so add it to the quotient (Q).  */
+2:	addq	Q, S, t3
+	srl	S, 1, S
+	cmpule	SY, R, AT
+	subq	R, SY, t4
+
+	cmovne	AT, t3, Q
+	cmovne	AT, t4, R
+	srl	SY, 1, SY
+	bne	S, 2b
+
+	br	$q_low_ret
+
+	.align	4
+	cfi_restore_state
+$y_is_neg:
+	/* If we get here, Y is so big that bit 63 is set.  The results
+	   from the divide will be completely wrong.  Fortunately, the
+	   quotient must be either 0 or 1, so the remainder must be X
+	   or X-Y, so just compute it directly.  */
+	cmpult	Y, X, AT
+	subq	X, Y, RV
+	ldt	$f0, 0(sp)
+	cmoveq	AT, X, RV
+
+	lda	sp, FRAME(sp)
+	cfi_restore ($f0)
+	cfi_def_cfa_offset (0)
+	ret	$31, (RA), 1
+
+	cfi_endproc
+	.size	__remqu, .-__remqu
+
+	DO_DIVBYZERO