Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>

* malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include <errno.h>. * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de> * malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include <errno.h>. * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
2024-11-16 01:50:11 +00:00 · 1995-10-16 01:37:51 +00:00 · 1995-10-16 01:37:51 +00:00 · 8f5ca04bc7
commit 8f5ca04bc7
parent 5d82cf5c55
109 changed files with 9751 additions and 357 deletions
--- a/.cvsignore
+++ b/.cvsignore
@ -5,7 +5,7 @@ glibc-*

 configparms 

-sun4 i386 i386-gnuelf hp300-netbsd hp300 i486-linux
+sun[43]* i[345]86* hp300*

 ieeetest hppa-sysdeps regex

--- a/40
+++ b/40
@ -1,3 +1,43 @@
+Sat Oct 14 02:52:36 1995  Ulrich Drepper  <drepper@ipd.info.uni-karlsruhe.de>
+
+	* malloc/malloc.c (_malloc_internal): Performance fix.  Move
+	if statement out of loop.
+
+	* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
+	implementation using GMP functions.  Contributed by
+	Torbjorn Granlund and Ulrich Drepper.
+
+	* stdio/test_rdwr.c: Include <errno.h>.
+
+	* sysdeps/i386/i586/Implies: New file.
+
+	New highly optimized string functions for i[345]86.
+	* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
+        * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
+        * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
+        * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
+        * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
+        * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
+        * sysdeps/i386/i586/strlen.S: New file.
+	* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.
+
+	* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
+	not correspond to used values.
+
+	* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
+        around a kernel header file.
+	* sysdeps/unix/sysv/linux/Dist: Add it.
+	* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
+	Likewise.
+
+	* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of
+        defining ourself we use a kernel header file.
+
+	* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
+        call handler for i586.
+
+	* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
+
 Wed Oct 11 00:00:00 1995  Roland McGrath  <roland@churchy.gnu.ai.mit.edu>

 	* sysdeps/i386/dl-machine.h (elf_machine_rel): Use +=, not =, to
--- a/configure.in
+++ b/configure.in
@ -82,22 +82,18 @@ changequote(,)dnl
 # Expand the configuration machine name into a subdirectory by architecture
 # type and particular chip.
 case "$machine" in
-i[345]86)
-  machine=i386/$machine ;;
-sparc[6789])
-  machine=sparc/$machine ;;
-m68k)
-  machine=m68k/m68020 ;;
-m680?0)
-  machine=m68k/$machine ;;
-m88k)
-  machine=m88k/m88100 ;;
-m88???)
-  machine=m88k/$machine ;;
-mips64*)
-  machine=mips/mips64/$machine ;;
-mips*)
-  machine=mips/$machine ;;
+a29k | am29000)	machine=a29k ;;
+alpha*)		machine=alpha/$machine ;;
+hppa*)		machine=hppa/$machine ;;
+i[345]86)	machine=i386/$machine ;;
+m680?0)		machine=m68k/$machine ;;
+m68k)		machine=m68k/m68020 ;;
+m88???)		machine=m88k/$machine ;;
+m88k)		machine=m88k/m88100 ;;
+mips*)		machine=mips/$machine ;;
+mips64*)	machine=mips/mips64/$machine ;;
+sparc[6789])	machine=sparc/$machine ;;
+supersparc)	machine=sparc/sparc8 ;;
 esac

 # Make sco3.2v4 become sco3.2.4 and sunos4.1.1_U1 become sunos4.1.1.U1.
--- a/hurd/Makefile
+++ b/hurd/Makefile
@ -26,7 +26,7 @@ include ../Makeconfig

 headers = hurd.h $(interface-headers) \
 	  $(addprefix hurd/,fd.h id.h port.h signal.h userlink.h \
-		            resource.h threadvar.h)
+		            resource.h threadvar.h lookup.h)

 distribute := hurdstartup.h hurdfault.h intr-rpc.defs STATUS

@ -44,7 +44,7 @@ routines = hurdstartup hurdinit \
 	   setauth \
 	   pid2task task2pid \
 	   getuids setuids getumask fchroot \
-	   hurdsock hurdauth invoke-trans \
+	   hurdsock hurdauth \
 	   privports \
 	   msgportdemux \
 	   fopenport \
--- a/hurd/hurd.h
+++ b/hurd/hurd.h
@ -77,11 +77,16 @@ extern struct hurd_port *_hurd_ports;
 extern unsigned int _hurd_nports;
 extern volatile mode_t _hurd_umask;

-/* Shorthand macro for referencing _hurd_ports (see <hurd/port.h>).  */
+/* Shorthand macro for internal library code referencing _hurd_ports (see
+   <hurd/port.h>).  */

 #define	__USEPORT(which, expr) \
  HURD_PORT_USE (&_hurd_ports[INIT_PORT_##which], (expr))

+/* Function version of __USEPORT: calls OPERATE with a send right.  */
+
+extern error_t _hurd_ports_use (int which, error_t (*operate) (mach_port_t));
+

 /* Base address and size of the initial stack set up by the exec server.
   If using cthreads, this stack is deallocated in startup.
@ -150,52 +155,6 @@ extern int setcttyid (mach_port_t);
 extern int __setauth (auth_t), setauth (auth_t);


-/* Split FILE into a directory and a name within the directory.  Look up a
-   port for the directory and store it in *DIR; store in *NAME a pointer
-   into FILE where the name within directory begins.  The directory lookup
-   uses CRDIR for the root directory and CWDIR for the current directory.
-   Returns zero on success or an error code.  */
-
-extern error_t __hurd_file_name_split (file_t crdir, file_t cwdir,
-				       const char *file,
-				       file_t *dir, char **name);
-extern error_t hurd_file_name_split (file_t crdir, file_t cwdir,
-				     const char *file,
-				     file_t *dir, char **name);
-
-/* Open a port to FILE with the given FLAGS and MODE (see <fcntl.h>).
-   The file lookup uses CRDIR for the root directory and CWDIR for the
-   current directory.  If successful, returns zero and store the port
-   to FILE in *PORT; otherwise returns an error code. */
-
-extern error_t __hurd_file_name_lookup (file_t crdir, file_t cwdir,
-					const char *file,
-					int flags, mode_t mode,
-					file_t *port);
-extern error_t hurd_file_name_lookup (file_t crdir, file_t cwdir,
-				      const char *filename,
-				      int flags, mode_t mode,
-				      file_t *port);
-
-/* Process the values returned by `dir_lookup' et al, and loop doing
-   `dir_lookup' calls until one returns FS_RETRY_NONE.  CRDIR is the
-   root directory used for things like symlinks to absolute file names; the
-   other arguments should be those just passed to and/or returned from
-   `dir_lookup', `fsys_getroot', or `file_invoke_translator'.  This
-   function consumes the reference in *RESULT even if it returns an error.  */
-
-extern error_t __hurd_file_name_lookup_retry (file_t crdir,
-					      enum retry_type doretry,
-					      char retryname[1024],
-					      int flags, mode_t mode,
-					      file_t *result);
-extern error_t hurd_file_name_lookup_retry (file_t crdir,
-					    enum retry_type doretry,
-					    char retryname[1024],
-					    int flags, mode_t mode,
-					    file_t *result);
-
-
 /* Split FILE into a directory and a name within the directory.  The
   directory lookup uses the current root and working directory.  If
   successful, stores in *NAME a pointer into FILE where the name
@ -213,15 +172,15 @@ extern file_t file_name_split (const char *file, char **name);
 extern file_t __file_name_lookup (const char *file, int flags, mode_t mode);
 extern file_t file_name_lookup (const char *file, int flags, mode_t mode);

-/* Invoke any translator set on the node FILE represents, and return in
-   *TRANSLATED a port to the translated node.  FLAGS are as for
-   `dir_lookup' et al, but the returned port will not necessarily have
-   any more access rights than FILE does.  */
+/* Open a port to FILE with the given FLAGS and MODE (see <fcntl.h>).  The
+   file lookup uses the current root directory, but uses STARTDIR as the
+   "working directory" for file relative names.  Returns a port to the file
+   if successful; otherwise sets `errno' and returns MACH_PORT_NULL.  */

-extern error_t __hurd_invoke_translator (file_t file, int flags,
-					 file_t *translated);
-extern error_t hurd_invoke_translator (file_t file, int flags,
-				       file_t *translated);
+extern file_t __file_name_lookup_under (file_t startdir, const char *file,
+					int flags, mode_t mode);
+extern file_t file_name_lookup_under (file_t startdir, const char *file,
+				      int flags, mode_t mode);


 /* Open a file descriptor on a port.  FLAGS are as for `open'; flags
--- a/hurd/hurdinit.c
+++ b/hurd/hurdinit.c
@ -31,6 +31,12 @@ struct hurd_port *_hurd_ports;
 unsigned int _hurd_nports;
 mode_t _hurd_umask;

+error_t
+_hurd_ports_use (int which, error_t (*operate) (mach_port_t))
+{
+  return HURD_PORT_USE (&_hurd_ports[which], (*operate) (port));
+}
+
 void _hurd_proc_init (char **argv);

 DEFINE_HOOK (_hurd_subinit, (void));
--- a/stdio/_itoa.c
+++ b/stdio/_itoa.c
@ -1,6 +1,8 @@
 /* Internal function for converting integers to ASCII.
 Copyright (C) 1994, 1995 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
+Contributed by Torbjorn Granlund <tege@matematik.su.se>
+and Ulrich Drepper <drepper@gnu.ai.mit.edu>.

 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@ -17,13 +19,400 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
 not, write to the Free Software Foundation, Inc., 675 Mass Ave,
 Cambridge, MA 02139, USA.  */

-/* Lower-case digits.  */
-const char _itoa_lower_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
-/* Upper-case digits.  */
-const char _itoa_upper_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-
-/* Cause _itoa.h to define _itoa as a real function instead of an
-   `extern inline'.  */
-#define _EXTERN_INLINE /* empty */
+#include <gmp-mparam.h>
+#include "../stdlib/gmp.h"
+#include "../stdlib/gmp-impl.h"
+#include "../stdlib/longlong.h"

 #include "_itoa.h"
+
+
+/* Canonize environment.  For some architectures not all values might
+   be defined in the GMP header files.  */
+#ifndef UMUL_TIME
+# define UMUL_TIME 1
+#endif
+#ifndef UDIV_TIME
+# define UDIV_TIME 1
+#endif
+
+/* Control memory layout.  */
+#ifdef PACK
+# undef PACK
+# define PACK __attribute__ ((packed))
+#else
+# define PACK
+#endif
+
+
+/* Declare local types.  */
+struct base_table_t
+{
+#if (UDIV_TIME > 2 * UMUL_TIME)
+  mp_limb base_multiplier;
+#endif
+  char flag;
+  char post_shift;
+#if BITS_PER_MP_LIMB == 32
+  struct
+    {
+      char normalization_steps;
+      char ndigits;
+      mp_limb base PACK;
+#if UDIV_TIME > 2 * UMUL_TIME
+      mp_limb base_ninv PACK;
+#endif
+    } big;
+#endif
+};
+
+/* To reduce the memory needed we include some fields of the tables
+   only confitionally.  */
+#if BITS_PER_MP_LIMB == 32
+# if UDIV_TIME > 2 * UMUL_TIME
+#  define SEL1(X) X,
+#  define SEL2(X) ,X
+# else
+#  define SEL1(X)
+#  define SEL2(X)
+# endif
+#endif
+
+
+/* Local variables.  */
+static const struct base_table_t base_table[] =
+{
+#if BITS_PER_MP_LIMB == 64
+  /*  2 */ {0ul, 1, 1},
+  /*  3 */ {0xaaaaaaaaaaaaaaabul, 0, 1},
+  /*  4 */ {0ul, 1, 2},
+  /*  5 */ {0xcccccccccccccccdul, 0, 2},
+  /*  6 */ {0xaaaaaaaaaaaaaaabul, 0, 2},
+  /*  7 */ {0x2492492492492493ul, 1, 3},
+  /*  8 */ {0ul, 1, 3},
+  /*  9 */ {0xe38e38e38e38e38ful, 0, 3},
+  /* 10 */ {0xcccccccccccccccdul, 0, 3},
+  /* 11 */ {0x2e8ba2e8ba2e8ba3ul, 0, 1},
+  /* 12 */ {0xaaaaaaaaaaaaaaabul, 0, 3},
+  /* 13 */ {0x4ec4ec4ec4ec4ec5ul, 0, 2},
+  /* 14 */ {0x2492492492492493ul, 1, 4},
+  /* 15 */ {0x8888888888888889ul, 0, 3},
+  /* 16 */ {0ul, 1, 4},
+  /* 17 */ {0xf0f0f0f0f0f0f0f1ul, 0, 4},
+  /* 18 */ {0xe38e38e38e38e38ful, 0, 4},
+  /* 19 */ {0xd79435e50d79435ful, 0, 4},
+  /* 20 */ {0xcccccccccccccccdul, 0, 4},
+  /* 21 */ {0x8618618618618619ul, 1, 5},
+  /* 22 */ {0x2e8ba2e8ba2e8ba3ul, 0, 2},
+  /* 23 */ {0x642c8590b21642c9ul, 1, 5},
+  /* 24 */ {0xaaaaaaaaaaaaaaabul, 0, 4},
+  /* 25 */ {0x47ae147ae147ae15ul, 1, 5},
+  /* 26 */ {0x4ec4ec4ec4ec4ec5ul, 0, 3},
+  /* 27 */ {0x97b425ed097b425ful, 0, 4},
+  /* 28 */ {0x2492492492492493ul, 1, 5},
+  /* 29 */ {0x1a7b9611a7b9611bul, 1, 5},
+  /* 30 */ {0x8888888888888889ul, 0, 4},
+  /* 31 */ {0x0842108421084211ul, 1, 5},
+  /* 32 */ {0ul, 1, 5},
+  /* 33 */ {0x0f83e0f83e0f83e1ul, 0, 1},
+  /* 34 */ {0xf0f0f0f0f0f0f0f1ul, 0, 5},
+  /* 35 */ {0xea0ea0ea0ea0ea0ful, 0, 5},
+  /* 36 */ {0xe38e38e38e38e38ful, 0, 5}
+#endif
+#if BITS_PER_MP_LIMB == 32
+  /*  2 */ {SEL1(0ul) 1, 1, {0, 31, 0x80000000ul SEL2(0xfffffffful)}},
+  /*  3 */ {SEL1(0xaaaaaaabul) 0, 1, {0, 20, 0xcfd41b91ul SEL2(0x3b563c24ul)}},
+  /*  4 */ {SEL1(0ul) 1, 2, {1, 15, 0x40000000ul SEL2(0xfffffffful)}},
+  /*  5 */ {SEL1(0xcccccccdul) 0, 2, {1, 13, 0x48c27395ul SEL2(0xc25c2684ul)}},
+  /*  6 */ {SEL1(0xaaaaaaabul) 0, 2, {0, 12, 0x81bf1000ul SEL2(0xf91bd1b6ul)}},
+  /*  7 */ {SEL1(0x24924925ul) 1, 3, {1, 11, 0x75db9c97ul SEL2(0x1607a2cbul)}},
+  /*  8 */ {SEL1(0ul) 1, 3, {1, 10, 0x40000000ul SEL2(0xfffffffful)}},
+  /*  9 */ {SEL1(0x38e38e39ul) 0, 1, {0, 10, 0xcfd41b91ul SEL2(0x3b563c24ul)}},
+  /* 10 */ {SEL1(0xcccccccdul) 0, 3, {2, 9, 0x3b9aca00ul SEL2(0x12e0be82ul)}},
+  /* 11 */ {SEL1(0xba2e8ba3ul) 0, 3, {0, 9, 0x8c8b6d2bul SEL2(0xd24cde04ul)}},
+  /* 12 */ {SEL1(0xaaaaaaabul) 0, 3, {3, 8, 0x19a10000ul SEL2(0x3fa39ab5ul)}},
+  /* 13 */ {SEL1(0x4ec4ec4ful) 0, 2, {2, 8, 0x309f1021ul SEL2(0x50f8ac5ful)}},
+  /* 14 */ {SEL1(0x24924925ul) 1, 4, {1, 8, 0x57f6c100ul SEL2(0x74843b1eul)}},
+  /* 15 */ {SEL1(0x88888889ul) 0, 3, {0, 8, 0x98c29b81ul SEL2(0xad0326c2ul)}},
+  /* 16 */ {SEL1(0ul) 1, 4, {3, 7, 0x10000000ul SEL2(0xfffffffful)}},
+  /* 17 */ {SEL1(0xf0f0f0f1ul) 0, 4, {3, 7, 0x18754571ul SEL2(0x4ef0b6bdul)}},
+  /* 18 */ {SEL1(0x38e38e39ul) 0, 2, {2, 7, 0x247dbc80ul SEL2(0xc0fc48a1ul)}},
+  /* 19 */ {SEL1(0xaf286bcbul) 1, 5, {2, 7, 0x3547667bul SEL2(0x33838942ul)}},
+  /* 20 */ {SEL1(0xcccccccdul) 0, 4, {1, 7, 0x4c4b4000ul SEL2(0xad7f29abul)}},
+  /* 21 */ {SEL1(0x86186187ul) 1, 5, {1, 7, 0x6b5a6e1dul SEL2(0x313c3d15ul)}},
+  /* 22 */ {SEL1(0xba2e8ba3ul) 0, 4, {0, 7, 0x94ace180ul SEL2(0xb8cca9e0ul)}},
+  /* 23 */ {SEL1(0xb21642c9ul) 0, 4, {0, 7, 0xcaf18367ul SEL2(0x42ed6de9ul)}},
+  /* 24 */ {SEL1(0xaaaaaaabul) 0, 4, {4, 6, 0x0b640000ul SEL2(0x67980e0bul)}},
+  /* 25 */ {SEL1(0x51eb851ful) 0, 3, {4, 6, 0x0e8d4a51ul SEL2(0x19799812ul)}},
+  /* 26 */ {SEL1(0x4ec4ec4ful) 0, 3, {3, 6, 0x1269ae40ul SEL2(0xbce85396ul)}},
+  /* 27 */ {SEL1(0x2f684bdbul) 1, 5, {3, 6, 0x17179149ul SEL2(0x62c103a9ul)}},
+  /* 28 */ {SEL1(0x24924925ul) 1, 5, {3, 6, 0x1cb91000ul SEL2(0x1d353d43ul)}},
+  /* 29 */ {SEL1(0x8d3dcb09ul) 0, 4, {2, 6, 0x23744899ul SEL2(0xce1deceaul)}},
+  /* 30 */ {SEL1(0x88888889ul) 0, 4, {2, 6, 0x2b73a840ul SEL2(0x790fc511ul)}},
+  /* 31 */ {SEL1(0x08421085ul) 1, 5, {2, 6, 0x34e63b41ul SEL2(0x35b865a0ul)}},
+  /* 32 */ {SEL1(0ul) 1, 5, {1, 6, 0x40000000ul SEL2(0xfffffffful)}},
+  /* 33 */ {SEL1(0x3e0f83e1ul) 0, 3, {1, 6, 0x4cfa3cc1ul SEL2(0xa9aed1b3ul)}},
+  /* 34 */ {SEL1(0xf0f0f0f1ul) 0, 5, {1, 6, 0x5c13d840ul SEL2(0x63dfc229ul)}},
+  /* 35 */ {SEL1(0xd41d41d5ul) 1, 6, {1, 6, 0x6d91b519ul SEL2(0x2b0fee30ul)}},
+  /* 36 */ {SEL1(0x38e38e39ul) 0, 3, {0, 6, 0x81bf1000ul SEL2(0xf91bd1b6ul)}}
+#endif
+};
+
+/* Lower-case digits.  */
+static const char _itoa_lower_digits[]
+	= "0123456789abcdefghijklmnopqrstuvwxyz";
+/* Upper-case digits.  */
+static const char _itoa_upper_digits[]
+	= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+
+char *
+_itoa (value, buflim, base, upper_case)
+     unsigned long long int value;
+     char *buflim;
+     unsigned int base;
+     int upper_case;
+{
+  const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits;
+  char *bp = buflim;
+  const struct base_table_t *brec = &base_table[base - 2];
+
+  switch (base)
+    {
+#define RUN_2N(BITS)							  \
+      do								  \
+        {								  \
+	  /* `unsigned long long int' always has 64 bits.  */		  \
+	  mp_limb work_hi = value >> (64 - BITS_PER_MP_LIMB);		  \
+									  \
+	  if (BITS_PER_MP_LIMB == 32)					  \
+	    if (work_hi != 0)						  \
+	      {								  \
+		mp_limb work_lo;					  \
+		int cnt;						  \
+									  \
+		work_lo = value & 0xfffffffful;				  \
+		for (cnt = BITS_PER_MP_LIMB / BITS; cnt > 0; --cnt)	  \
+		  {							  \
+		    *--bp = digits[work_lo & ((1ul << BITS) - 1)];	  \
+		    work_lo >>= BITS;					  \
+		  }							  \
+		if (BITS_PER_MP_LIMB % BITS != 0)			  \
+		  {							  \
+		    work_lo |= ((work_hi				  \
+				 & ((1 << BITS - BITS_PER_MP_LIMB % BITS) \
+				    - 1))				  \
+				<< BITS_PER_MP_LIMB % BITS);		  \
+		    *--bp = digits[work_lo];				  \
+		    work_hi >>= BITS - BITS_PER_MP_LIMB % BITS;		  \
+		  }							  \
+	      }								  \
+	    else							  \
+	      work_hi = value & 0xfffffffful;				  \
+	  do								  \
+	    {								  \
+	      *--bp = digits[work_hi & ((1 << BITS) - 1)];		  \
+	      work_hi >>= BITS;						  \
+	    }								  \
+	  while (work_hi != 0);						  \
+	}								  \
+      while (0)
+    case 8:
+      RUN_2N (3);
+      break;
+
+    case 16:
+      RUN_2N (4);
+      break;
+
+    default:
+      {
+#if BITS_PER_MP_LIMB == 64
+	mp_limb base_multiplier = brec->base_multiplier;
+	if (brec->flag)
+	  while (value != 0)
+	    {
+	      mp_limb quo, rem, x, dummy;
+
+	      umul_ppmm (x, dummy, value, base_multiplier);
+	      quo = (x + ((value - x) >> 1)) >> (brec->post_shift - 1);
+	      rem = value - quo * base;
+	      *--bp = digits[rem];
+	      value = quo;
+	    }
+	else
+	  while (value != 0)
+	    {
+	      mp_limb quo, rem, x, dummy;
+
+	      umul_ppmm (x, dummy, value, base_multiplier);
+	      quo = x >> brec->post_shift;
+	      rem = value - quo * base;
+	      *--bp = digits[rem];
+	      value = quo;
+	    }
+#endif
+#if BITS_PER_MP_LIMB == 32
+	mp_limb t[3];
+	int n;
+
+	/* First convert x0 to 1-3 words in base s->big.base.
+	   Optimize for frequent cases of 32 bit numbers.  */
+	if ((mp_limb) (value >> 32) >= 1)
+	  {
+	    int big_normalization_steps = brec->big.normalization_steps;
+	    mp_limb big_base_norm = brec->big.base << big_normalization_steps;
+
+	    if ((mp_limb) (value >> 32) >= brec->big.base)
+	      {
+		mp_limb x1hi, x1lo, r;
+		/* If you want to optimize this, take advantage of
+		   that the quotient in the first udiv_qrnnd will
+		   always be very small.  It might be faster just to
+		   subtract in a tight loop.  */
+
+#if UDIV_TIME > 2 * UMUL_TIME
+		mp_limb x, xh, xl;
+
+		if (big_normalization_steps == 0)
+		  xh = 0;
+		else
+		  xh = (mp_limb) (value >> 64 - big_normalization_steps);
+		xl = (mp_limb) (value >> 32 - big_normalization_steps);
+		udiv_qrnnd_preinv (x1hi, r, xh, xl, big_base_norm,
+				   brec->big.base_ninv);
+
+		xl = ((mp_limb) value) << big_normalization_steps;
+		udiv_qrnnd_preinv (x1lo, x, r, xl, big_base_norm,
+				   big_normalization_steps);
+		t[2] = x >> big_normalization_steps;
+
+		if (big_normalization_steps == 0)
+		  xh = x1hi;
+		else
+		  xh = ((x1hi << big_normalization_steps)
+			| (x1lo >> 32 - big_normalization_steps));
+		xl = x1lo << big_normalization_steps;
+		udiv_qrnnd_preinv (t[0], x, xh, xl, big_base_norm,
+				   big_normalization_steps);
+		t[1] = x >> big_normalization_steps;
+#elif UDIV_NEEDS_NORMALIZATION
+		mp_limb x, xh, xl;
+
+		if (big_normalization_steps == 0)
+		  xh = 0;
+		else
+		  xh = (mp_limb) (value >> 64 - big_normalization_steps);
+		xl = (mp_limb) (value >> 32 - big_normalization_steps);
+		udiv_qrnnd (x1hi, r, xh, xl, big_base_norm);
+
+		xl = ((mp_limb) value) << big_normalization_steps;
+		udiv_qrnnd (x1lo, x, r, xl, big_base_norm);
+		t[2] = x >> big_normalization_steps;
+
+		if (big_normalization_steps == 0)
+		  xh = x1hi;
+		else
+		  xh = ((x1hi << big_normalization_steps)
+			| (x1lo >> 32 - big_normalization_steps));
+		xl = x1lo << big_normalization_steps;
+		udiv_qrnnd (t[0], x, xh, xl, big_base_norm);
+		t[1] = x >> big_normalization_steps;
+#else
+		udiv_qrnnd (x1hi, r, 0, (mp_limb) (value >> 32),
+			    brec->big.base);
+		udiv_qrnnd (x1lo, t[2], r, (mp_limb) value, brec->big.base);
+		udiv_qrnnd (t[0], t[1], x1hi, x1lo, brec->big.base);
+#endif
+		n = 3;
+	      }
+	    else
+	      {
+#if (UDIV_TIME > 2 * UMUL_TIME)
+		mp_limb x;
+
+		value <<= brec->big.normalization_steps;
+		udiv_qrnnd_preinv (t[0], x, (mp_limb) (value >> 32),
+				   (mp_limb) value, big_base_norm,
+				   brec->big.base_ninv);
+		t[1] = x >> brec->big.normalization_steps;
+#elif UDIV_NEEDS_NORMALIZATION
+		mp_limb x;
+
+		value <<= big_normalization_steps;
+		udiv_qrnnd (t[0], x, (mp_limb) (value >> 32),
+			    (mp_limb) value, big_base_norm);
+		t[1] = x >> big_normalization_steps;
+#else
+		udiv_qrnnd (t[0], t[1], (mp_limb) (value >> 32),
+			    (mp_limb) value, brec->big.base);
+#endif
+		n = 2;
+	      }
+	  }
+	else
+	  {
+	    t[0] = value;
+	    n = 1;
+	  }
+
+	/* Convert the 1-3 words in t[], word by word, to ASCII.  */
+	do
+	  {
+	    mp_limb ti = t[--n];
+	    int ndig_for_this_limb = 0;
+
+#if UDIV_TIME > 2 * UMUL_TIME
+	    mp_limb base_multiplier = brec->base_multiplier;
+	    if (brec->flag)
+	      while (ti != 0)
+		{
+		  mp_limb quo, rem, x, dummy;
+
+		  umul_ppmm (x, dummy, ti, base_multiplier);
+		  quo = (x + ((ti - x) >> 1)) >> (brec->post_shift - 1);
+		  rem = ti - quo * base;
+		  *--bp = digits[rem];
+		  ti = quo;
+		  ++ndig_for_this_limb;
+		}
+	    else
+	      while (ti != 0)
+		{
+		  mp_limb quo, rem, x, dummy;
+
+		  umul_ppmm (x, dummy, ti, base_multiplier);
+		  quo = x >> brec->post_shift;
+		  rem = ti - quo * base;
+		  *--bp = digits[rem];
+		  ti = quo;
+		  ++ndig_for_this_limb;
+		}
+#else
+	    while (ti != 0)
+	      {
+		mp_limb quo, rem;
+
+		quo = ti / base;
+		rem = ti % base;
+		*--bp = digits[rem];
+		ti = quo;
+		++ndig_for_this_limb;
+	      }
+#endif
+	    /* If this wasn't the most significant word, pad with zeros.  */
+	    if (n != 0)
+	      while (ndig_for_this_limb < brec->big.ndigits)
+		{
+		  *--bp = '0';
+		  ++ndig_for_this_limb;
+		}
+	  }
+	while (n != 0);
+#endif
+      }
+      break;
+    }
+
+  return bp;
+}
--- a/stdio/_itoa.h
+++ b/stdio/_itoa.h
@ -21,8 +21,6 @@ Cambridge, MA 02139, USA.  */
 #define _ITOA_H
 #include <sys/cdefs.h>

-extern const char _itoa_lower_digits[], _itoa_upper_digits[];
-
 /* Convert VALUE into ASCII in base BASE (2..36).
   Write backwards starting the character just before BUFLIM.
   Return the address of the first (left-to-right) character in the number.
@ -31,28 +29,4 @@ extern const char _itoa_lower_digits[], _itoa_upper_digits[];
 extern char *_itoa __P ((unsigned long long int value, char *buflim,
 			 unsigned int base, int upper_case));

-#ifndef _EXTERN_INLINE
-#define _EXTERN_INLINE extern __inline 
-#endif
-
-_EXTERN_INLINE
-char *
-_itoa (unsigned long long int value, char *buflim,
-       unsigned int base, int upper_case)
-{
-  /* Base-36 digits for numbers.  */
-  const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits;
-
-  register char *bp = buflim;
-
-  while (value > 0)
-    {
-      *--bp = digits[value % base];
-      value /= base;
-    }
-
-  return bp;
-}
-
-
 #endif	/* itoa.h */
--- a/stdio/test_rdwr.c
+++ b/stdio/test_rdwr.c
@ -17,6 +17,7 @@ not, write to the Free Software Foundation, Inc., 675 Mass Ave,
 Cambridge, MA 02139, USA.  */

 #include <ansidecl.h>
+#include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
--- a/stdlib/gmp-impl.h
+++ b/stdlib/gmp-impl.h
@ -19,11 +19,17 @@ along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

 #if ! defined (alloca)
-#if defined (__GNUC__) || defined (__sparc__) || defined (sparc)
+#if defined (__GNUC__)
 #define alloca __builtin_alloca
 #endif
 #endif

+#if ! defined (alloca)
+#if defined (__sparc__) || defined (sparc) || defined (__sgi)
+#include <alloca.h>
+#endif
+#endif
+
 #ifndef NULL
 #define NULL 0L
 #endif
@ -168,6 +174,7 @@ void _mp_default_free ();
    else								\
      ____mpn_sqr_n (prodp, up, size, tspace);				\
  } while (0);
+#define assert(trueval) do {if (!(trueval)) abort ();} while (0)

 /* Structure for conversion between internal binary format and
   strings in base 2..36.  */
@ -197,9 +204,11 @@ struct bases
 extern const struct bases __mp_bases[];
 extern mp_size_t __gmp_default_fp_limb_precision;

-/* Divide the two-limb number in (NH,,NL) by D, with DI being a 32 bit
-   approximation to (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
-   Put the quotient in Q and the remainder in R.  */
+/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
+   limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
+   If this would yield overflow, DI should be the largest possible number
+   (i.e., only ones).  For correct operation, the most significant bit of D
+   has to be set.  Put the quotient in Q and the remainder in R.  */
 #define udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
  do {									\
    mp_limb _q, _ql, _r;						\
@ -226,6 +235,8 @@ extern mp_size_t __gmp_default_fp_limb_precision;
    (r) = _r;								\
    (q) = _q;								\
  } while (0)
+/* Like udiv_qrnnd_preinv, but for for any value D.  DNORM is D shifted left
+   so that its most significant bit is set.  LGUP is ceil(log2(D)).  */
 #define udiv_qrnnd_preinv2gen(q, r, nh, nl, d, di, dnorm, lgup) \
  do {									\
    mp_limb n2, n10, n1, nadj, q1;					\
@ -243,6 +254,8 @@ extern mp_size_t __gmp_default_fp_limb_precision;
    (r) = _xl + ((d) & _xh);						\
    (q) = _xh - q1;							\
  } while (0)
+/* Exactly like udiv_qrnnd_preinv, but branch-free.  It is not clear which
+   version to use.  */
 #define udiv_qrnnd_preinv2norm(q, r, nh, nl, d, di) \
  do {									\
    mp_limb n2, n10, n1, nadj, q1;					\
@ -262,22 +275,49 @@ extern mp_size_t __gmp_default_fp_limb_precision;
  } while (0)

 #if defined (__GNUC__)
-/* Define stuff for longlong.h asm macros.  */
-#if __GNUC_NEW_ATTR_MODE_SYNTAX
-typedef unsigned int UQItype	__attribute__ ((mode ("QI")));
-typedef 	 int SItype	__attribute__ ((mode ("SI")));
-typedef unsigned int USItype	__attribute__ ((mode ("SI")));
-typedef		 int DItype	__attribute__ ((mode ("DI")));
-typedef unsigned int UDItype	__attribute__ ((mode ("DI")));
-#else
+/* Define stuff for longlong.h.  */
 typedef unsigned int UQItype	__attribute__ ((mode (QI)));
 typedef 	 int SItype	__attribute__ ((mode (SI)));
 typedef unsigned int USItype	__attribute__ ((mode (SI)));
 typedef		 int DItype	__attribute__ ((mode (DI)));
 typedef unsigned int UDItype	__attribute__ ((mode (DI)));
-#endif
+#else
+typedef unsigned char UQItype;
+typedef 	 long SItype;
+typedef unsigned long USItype;
 #endif

 typedef mp_limb UWtype;
 typedef unsigned int UHWtype;
 #define W_TYPE_SIZE BITS_PER_MP_LIMB
+
+
+#ifndef IEEE_DOUBLE_BIG_ENDIAN
+#define IEEE_DOUBLE_BIG_ENDIAN 1
+#endif
+
+#if IEEE_DOUBLE_BIG_ENDIAN
+union ieee_double_extract
+{
+  struct
+    {
+      unsigned long sig:1;
+      unsigned long exp:11;
+      unsigned long manh:20;
+      unsigned long manl:32;
+    } s;
+  double d;
+};
+#else
+union ieee_double_extract
+{
+  struct
+    {
+      unsigned long manl:32;
+      unsigned long manh:20;
+      unsigned long exp:11;
+      unsigned long sig:1;
+    } s;
+  double d;
+};
+#endif
--- a/stdlib/gmp.h
+++ b/stdlib/gmp.h
@ -24,13 +24,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #define __need_size_t
 #include <stddef.h>

-#ifdef __STDC__
+#if defined (__STDC__)
 #define __gmp_const const
 #else
 #define __gmp_const
 #endif

-#ifdef __GNUC__
+#if defined (__GNUC__)
 #define __gmp_inline inline
 #else
 #define __gmp_inline
@ -40,9 +40,14 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 typedef unsigned int		mp_limb;
 typedef int			mp_limb_signed;
 #else
+#if _LONG_LONG_LIMB
+typedef unsigned long long int	mp_limb;
+typedef long long int		mp_limb_signed;
+#else
 typedef unsigned long int	mp_limb;
 typedef long int		mp_limb_signed;
 #endif
+#endif

 typedef mp_limb *		mp_ptr;
 typedef __gmp_const mp_limb *	mp_srcptr;
@ -52,9 +57,9 @@ typedef long int		mp_exp_t;
 #ifndef __MP_SMALL__
 typedef struct
 {
-  long int alloc;		/* Number of *limbs* allocated and pointed
+  mp_size_t alloc;		/* Number of *limbs* allocated and pointed
 				   to by the D field.  */
-  long int size;		/* abs(SIZE) is the number of limbs
+  mp_size_t size;		/* abs(SIZE) is the number of limbs
 				   the last field points to.  If SIZE
 				   is negative this is a negative
 				   number.  */
@ -130,12 +135,16 @@ typedef __mpf_struct *mpf_ptr;
 typedef __gmp_const __mpq_struct *mpq_srcptr;
 typedef __mpq_struct *mpq_ptr;

-#ifdef __STDC__
+#if defined (__STDC__)
 #define _PROTO(x) x
 #else
 #define _PROTO(x) ()
 #endif

+#if defined (FILE) || defined (_STDIO_H_) || defined (__STDIO_H__) || defined (H_STDIO)
+#define _GMP_H_HAVE_FILE 1
+#endif
+
 void mp_set_memory_functions _PROTO((void *(*) (size_t),
 				     void *(*) (void *, size_t, size_t),
 				     void (*) (void *, size_t)));
@ -165,7 +174,7 @@ unsigned long int mpz_get_ui _PROTO ((mpz_srcptr));
 mp_limb mpz_getlimbn _PROTO ((mpz_srcptr, mp_size_t));
 mp_size_t mpz_hamdist _PROTO ((mpz_srcptr, mpz_srcptr));
 void mpz_init _PROTO ((mpz_ptr));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
 void mpz_inp_raw _PROTO ((mpz_ptr, FILE *));
 int mpz_inp_str _PROTO ((mpz_ptr, FILE *, int));
 #endif
@ -180,7 +189,7 @@ void mpz_mul _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
 void mpz_mul_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
 void mpz_mul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
 void mpz_neg _PROTO ((mpz_ptr, mpz_srcptr));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
 void mpz_out_raw _PROTO ((FILE *, mpz_srcptr));
 void mpz_out_str _PROTO ((FILE *, int, mpz_srcptr));
 #endif
@ -218,6 +227,8 @@ void mpz_tdiv_qr_ui _PROTO((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
 void mpz_tdiv_r _PROTO((mpz_ptr, mpz_srcptr, mpz_srcptr));
 void mpz_tdiv_r_ui _PROTO((mpz_ptr, mpz_srcptr, unsigned long int));

+void mpz_array_init (mpz_ptr, size_t, mp_size_t);
+
 /**************** Rational (i.e. Q) routines.  ****************/

 void mpq_init _PROTO ((mpq_ptr));
@ -253,7 +264,7 @@ void mpf_dump _PROTO ((mpf_srcptr));
 char *mpf_get_str _PROTO ((char *, mp_exp_t *, int, size_t, mpf_srcptr));
 void mpf_init _PROTO ((mpf_ptr));
 void mpf_init2 _PROTO ((mpf_ptr, mp_size_t));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
 void mpf_inp_str _PROTO ((mpf_ptr, FILE *, int));
 #endif
 void mpf_init_set _PROTO ((mpf_ptr, mpf_srcptr));
@ -265,7 +276,7 @@ void mpf_mul _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
 void mpf_mul_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
 void mpf_mul_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
 void mpf_neg _PROTO ((mpf_ptr, mpf_srcptr));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
 void mpf_out_str _PROTO ((mpf_ptr, int, size_t, FILE *));
 #endif
 void mpf_set _PROTO ((mpf_ptr, mpf_srcptr));
@ -335,7 +346,7 @@ mp_limb __mpn_gcd_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb));


 static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
 __mpn_add_1 (register mp_ptr res_ptr,
 	     register mp_srcptr s1_ptr,
 	     register mp_size_t s1_size,
@ -377,7 +388,7 @@ __mpn_add_1 (res_ptr, s1_ptr, s1_size, s2_limb)
 }

 static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
 __mpn_add (register mp_ptr res_ptr,
 	   register mp_srcptr s1_ptr,
 	   register mp_size_t s1_size,
@ -406,7 +417,7 @@ __mpn_add (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
 }

 static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
 __mpn_sub_1 (register mp_ptr res_ptr,
 	     register mp_srcptr s1_ptr,
 	     register mp_size_t s1_size,
@ -448,7 +459,7 @@ __mpn_sub_1 (res_ptr, s1_ptr, s1_size, s2_limb)
 }

 static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
 __mpn_sub (register mp_ptr res_ptr,
 	   register mp_srcptr s1_ptr,
 	   register mp_size_t s1_size,
@ -477,7 +488,7 @@ __mpn_sub (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
 }

 static __gmp_inline mp_size_t
-#if __STDC__
+#if defined (__STDC__)
 __mpn_normal_size (mp_srcptr ptr, mp_size_t size)
 #else
 __mpn_normal_size (ptr, size)
@ -512,7 +523,6 @@ __mpn_normal_size (ptr, size)
 /* Useful synonyms, but not quite compatible with GMP 1.  */
 #define mpz_div		mpz_fdiv_q
 #define mpz_divmod	mpz_fdiv_qr
-#define mpz_mod		mpz_fdiv_r
 #define mpz_div_ui	mpz_fdiv_q_ui
 #define mpz_divmod_ui	mpz_fdiv_qr_ui
 #define mpz_mod_ui	mpz_fdiv_r_ui
--- a/stdlib/longlong.h
+++ b/stdlib/longlong.h
@ -97,7 +97,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #define __AND_CLOBBER_CC , "cc"
 #endif /* __GNUC__ < 2 */

-#if (defined (__a29k__) || defined (___AM29K__)) && W_TYPE_SIZE == 32
+#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %1,%4,%5
 	addc %0,%2,%3"							\
@ -152,6 +152,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
    (pl) = __m0 * __m1;							\
  } while (0)
 #define UMUL_TIME 46
+#ifndef LONGLONG_STANDALONE
 #define udiv_qrnnd(q, r, n1, n0, d) \
  do { UDItype __r;							\
    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
@ -159,12 +160,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  } while (0)
 extern UDItype __udiv_qrnnd ();
 #define UDIV_TIME 220
-#endif
+#endif /* LONGLONG_STANDALONE */
+#endif /* __alpha__ */

 #if defined (__arm__) && W_TYPE_SIZE == 32
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
-  __asm__ ("adds %1,%4,%5
-	adc %0,%2,%3"							\
+  __asm__ ("adds	%1, %4, %5
+	adc	%0, %2, %3"						\
 	   : "=r" ((USItype)(sh)),					\
 	     "=&r" ((USItype)(sl))					\
 	   : "%r" ((USItype)(ah)),					\
@ -172,8 +174,8 @@ extern UDItype __udiv_qrnnd ();
 	     "%r" ((USItype)(al)),					\
 	     "rI" ((USItype)(bl)))
 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
-  __asm__ ("subs %1,%4,%5
-	sbc %0,%2,%3"							\
+  __asm__ ("subs	%1, %4, %5
+	sbc	%0, %2, %3"						\
 	   : "=r" ((USItype)(sh)),					\
 	     "=&r" ((USItype)(sl))					\
 	   : "r" ((USItype)(ah)),					\
@ -181,19 +183,19 @@ extern UDItype __udiv_qrnnd ();
 	     "r" ((USItype)(al)),					\
 	     "rI" ((USItype)(bl)))
 #define umul_ppmm(xh, xl, a, b) \
-  __asm__ ("; Inlined umul_ppmm
-	mov	r0,%2 lsr 16
-	mov	r2,%3 lsr 16
-	bic	r1,%2,r0 lsl 16
-	bic	r2,%3,r2 lsl 16
-	mul	%1,r1,r2
-	mul	r2,r0,r2
-	mul	r1,%0,r1
-	mul	%0,r0,%0
-	adds	r1,r2,r1
-	addcs	%0,%0,0x10000
-	adds	%1,%1,r1 lsl 16
-	adc	%0,%0,r1 lsr 16"					\
+  __asm__ ("%@ Inlined umul_ppmm
+	mov	%|r0, %2, lsr #16
+	mov	%|r2, %3, lsr #16
+	bic	%|r1, %2, %|r0, lsl #16
+	bic	%|r2, %3, %|r2, lsl #16
+	mul	%1, %|r1, %|r2
+	mul	%|r2, %|r0, %|r2
+	mul	%|r1, %0, %|r1
+	mul	%0, %|r0, %0
+	adds	%|r1, %|r2, %|r1
+	addcs	%0, %0, #65536
+	adds	%1, %1, %|r1, lsl #16
+	adc	%0, %0, %|r1, lsr #16"					\
 	   : "=&r" ((USItype)(xh)),					\
 	     "=r" ((USItype)(xl))					\
 	   : "r" ((USItype)(a)),					\
@ -296,9 +298,9 @@ extern UDItype __udiv_qrnnd ();
 	   struct {USItype __h, __l;} __i;				\
 	  } __xx;							\
    __asm__ ("xmpyu %1,%2,%0"						\
-	     : "=x" (__xx.__ll)						\
-	     : "x" ((USItype)(u)),					\
-	       "x" ((USItype)(v)));					\
+	     : "=fx" (__xx.__ll)					\
+	     : "fx" ((USItype)(u)),					\
+	       "fx" ((USItype)(v)));					\
    (wh) = __xx.__i.__h;						\
    (wl) = __xx.__i.__l;						\
  } while (0)
@ -308,12 +310,14 @@ extern UDItype __udiv_qrnnd ();
 #define UMUL_TIME 40
 #define UDIV_TIME 80
 #endif
+#ifndef LONGLONG_STANDALONE
 #define udiv_qrnnd(q, r, n1, n0, d) \
  do { USItype __r;							\
    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
    (r) = __r;								\
  } while (0)
 extern USItype __udiv_qrnnd ();
+#endif /* LONGLONG_STANDALONE */
 #define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
@ -419,8 +423,12 @@ extern USItype __udiv_qrnnd ();
  } while (0)
 #define count_trailing_zeros(count, x) \
  __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)))
+#ifndef UMUL_TIME
 #define UMUL_TIME 40
+#endif
+#ifndef UDIV_TIME
 #define UDIV_TIME 40
+#endif
 #endif /* 80x86 */

 #if defined (__i960__) && W_TYPE_SIZE == 32
@ -442,7 +450,7 @@ extern USItype __udiv_qrnnd ();
    __w; })  
 #endif /* __i960__ */

-#if defined (__mc68000__) && W_TYPE_SIZE == 32
+#if (defined (__mc68000__) || defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1
 	addx%.l %3,%0"							\
@ -489,38 +497,34 @@ extern USItype __udiv_qrnnd ();
 	   : "=d" ((USItype)(count))					\
 	   : "od" ((USItype)(x)), "n" (0))
 #else /* not mc68020 */
-#define umul_ppmm(xh, xl, a, b) \
-  __asm__ ("| Inlined umul_ppmm
-	move%.l	%2,%/d0
-	move%.l	%3,%/d1
-	move%.l	%/d0,%/d2
-	swap	%/d0
-	move%.l	%/d1,%/d3
-	swap	%/d1
-	move%.w	%/d2,%/d4
-	mulu	%/d3,%/d4
-	mulu	%/d1,%/d2
-	mulu	%/d0,%/d3
-	mulu	%/d0,%/d1
-	move%.l	%/d4,%/d0
-	eor%.w	%/d0,%/d0
-	swap	%/d0
-	add%.l	%/d0,%/d2
-	add%.l	%/d3,%/d2
+#define umul_ppmmxx(xh, xl, a, b) \
+  do { USItype __umul_tmp1, __umul_tmp2;				\
+	__asm__ ("| Inlined umul_ppmm
+	move%.l	%5,%3
+	move%.l	%2,%0
+	move%.w	%3,%1
+	swap	%3
+	swap	%0
+	mulu	%2,%1
+	mulu	%3,%0
+	mulu	%2,%3
+	swap	%2
+	mulu	%5,%2
+	add%.l	%3,%2
 	jcc	1f
-	add%.l	#65536,%/d1
-1:	swap	%/d2
-	moveq	#0,%/d0
-	move%.w	%/d2,%/d0
-	move%.w	%/d4,%/d2
-	move%.l	%/d2,%1
-	add%.l	%/d1,%/d0
-	move%.l	%/d0,%0"						\
-	   : "=g" ((USItype)(xh)),					\
-	     "=g" ((USItype)(xl))					\
-	   : "g" ((USItype)(a)),					\
-	     "g" ((USItype)(b))						\
-	   : "d0", "d1", "d2", "d3", "d4")
+	add%.l	%#0x10000,%0
+1:	move%.l	%2,%3
+	clr%.w	%2
+	swap	%2
+	swap	%3
+	clr%.w	%3
+	add%.l	%3,%1
+	addx%.l	%2,%0
+	| End inlined umul_ppmm"					\
+	      : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)),		\
+	        "=d" (__umul_tmp1), "=&d" (__umul_tmp2)			\
+	      : "%2" ((USItype)(a)), "d" ((USItype)(b)));		\
+  } while (0)
 #define UMUL_TIME 100
 #define UDIV_TIME 400
 #endif /* not mc68020 */
@ -553,7 +557,7 @@ extern USItype __udiv_qrnnd ();
 	     : "r" ((USItype)(x)));					\
    (count) = __cbtmp ^ 31;						\
  } while (0)
-#if defined (__mc88110__)
+#if defined (__m88110__)
 #define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
@ -582,10 +586,18 @@ extern USItype __udiv_qrnnd ();
 #else
 #define UMUL_TIME 17
 #define UDIV_TIME 150
-#endif /* __mc88110__ */
+#endif /* __m88110__ */
 #endif /* __m88000__ */

 #if defined (__mips__) && W_TYPE_SIZE == 32
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("multu %2,%3"						\
+	   : "=l" ((USItype)(w0)),					\
+	     "=h" ((USItype)(w1))					\
+	   : "d" ((USItype)(u)),					\
+	     "d" ((USItype)(v)))
+#else
 #define umul_ppmm(w1, w0, u, v) \
  __asm__ ("multu %2,%3
 	mflo %0
@ -594,11 +606,20 @@ extern USItype __udiv_qrnnd ();
 	     "=d" ((USItype)(w1))					\
 	   : "d" ((USItype)(u)),					\
 	     "d" ((USItype)(v)))
+#endif
 #define UMUL_TIME 10
 #define UDIV_TIME 100
 #endif /* __mips__ */

 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("dmultu %2,%3"						\
+	   : "=l" ((UDItype)(w0)),					\
+	     "=h" ((UDItype)(w1))					\
+	   : "d" ((UDItype)(u)),					\
+	     "d" ((UDItype)(v)))
+#else
 #define umul_ppmm(w1, w0, u, v) \
  __asm__ ("dmultu %2,%3
 	mflo %0
@ -607,8 +628,9 @@ extern USItype __udiv_qrnnd ();
 	     "=d" ((UDItype)(w1))					\
 	   : "d" ((UDItype)(u)),					\
 	     "d" ((UDItype)(v)))
-#define UMUL_TIME 10
-#define UDIV_TIME 100
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
 #endif /* __mips__ */

 #if defined (__ns32000__) && W_TYPE_SIZE == 32
@ -647,7 +669,7 @@ extern USItype __udiv_qrnnd ();
  } while (0)
 #endif /* __ns32000__ */

-#if (defined (__powerpc__) || defined (___IBMR2__)) && W_TYPE_SIZE == 32
+#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
@ -676,14 +698,14 @@ extern USItype __udiv_qrnnd ();
 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"		\
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	\
 	       : "=r" ((USItype)(sh)),					\
 		 "=&r" ((USItype)(sl))					\
 	       : "r" ((USItype)(bh)),					\
 		 "rI" ((USItype)(al)),					\
 		 "r" ((USItype)(bl)));					\
    else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0)		\
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"		\
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	\
 	       : "=r" ((USItype)(sh)),					\
 		 "=&r" ((USItype)(sl))					\
 	       : "r" ((USItype)(bh)),					\
@ -716,7 +738,7 @@ extern USItype __udiv_qrnnd ();
  __asm__ ("{cntlz|cntlzw} %0,%1"					\
 	   : "=r" ((USItype)(count))					\
 	   : "r" ((USItype)(x)))
-#if defined (__powerpc__)
+#if defined (_ARCH_PPC)
 #define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
@ -785,16 +807,15 @@ extern USItype __udiv_qrnnd ();
 	     "g" ((USItype)(bh)),					\
 	     "1" ((USItype)(al)),					\
 	     "g" ((USItype)(bl)))
-/* This insn doesn't work on ancient pyramids.  */
+/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
 #define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
 	   struct {USItype __h, __l;} __i;				\
 	  } __xx;							\
-  __xx.__i.__l = u;							\
-  __asm__ ("uemul %3,%0"						\
-	   : "=r" (__xx.__i.__h),					\
-	     "=r" (__xx.__i.__l)					\
-	   : "1" (__xx.__i.__l),					\
+  __asm__ ("movw %1,%R0
+	uemul %2,%0"							\
+	   : "=&r" (__xx.__ll)						\
+	   : "g" ((USItype) (u)),					\
 	     "g" ((USItype)(v)));					\
  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
 #endif /* __pyr__ */
@ -868,6 +889,20 @@ extern USItype __udiv_qrnnd ();
  } while (0)
 #endif

+#if defined (__sh2__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ (								\
+       "dmulu.l	%2,%3
+	sts	macl,%1
+	sts	mach,%0"						\
+	   : "=r" ((USItype)(w1)),					\
+	     "=r" ((USItype)(w0))					\
+	   : "r" ((USItype)(u)),					\
+	     "r" ((USItype)(v))						\
+	   : "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
 #if defined (__sparc__) && W_TYPE_SIZE == 32
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1
@ -901,17 +936,21 @@ extern USItype __udiv_qrnnd ();
 	   : "r" ((USItype)(u)),					\
 	     "r" ((USItype)(v)))
 #define UMUL_TIME 5
-/* We might want to leave this undefined for `SuperSPARC (tm)' since
-   its implementation is crippled and often traps.  */
+#ifndef SUPERSPARC	/* SuperSPARC's udiv only handles 53 bit dividends */
 #define udiv_qrnnd(q, r, n1, n0, d) \
-  __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
-	   : "=&r" ((USItype)(q)),					\
-	     "=&r" ((USItype)(r))					\
-	   : "r" ((USItype)(n1)),					\
-	     "r" ((USItype)(n0)),					\
-	     "r" ((USItype)(d)))
+  do {									\
+    USItype __q;							\
+    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"			\
+	     : "=r" ((USItype)(__q))					\
+	     : "r" ((USItype)(n1)),					\
+	       "r" ((USItype)(n0)),					\
+	       "r" ((USItype)(d)));					\
+    (r) = (n0) - __q * (d);						\
+    (q) = __q;								\
+  } while (0)
 #define UDIV_TIME 25
-#else
+#endif /* SUPERSPARC */
+#else /* ! __sparc_v8__ */
 #if defined (__sparclite__)
 /* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
@ -973,9 +1012,10 @@ extern USItype __udiv_qrnnd ();
  __asm__ ("scan %1,0,%0"						\
 	   : "=r" ((USItype)(x))					\
 	   : "r" ((USItype)(count)))
-#else
-/* SPARC without integer multiplication and divide instructions.
-   (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
+#ifndef umul_ppmm
 #define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm
 	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr
@ -1023,6 +1063,9 @@ extern USItype __udiv_qrnnd ();
 	     "r" ((USItype)(v))						\
 	   : "%g1", "%g2" __AND_CLOBBER_CC)
 #define UMUL_TIME 39		/* 39 instructions */
+#endif
+#ifndef udiv_qrnnd
+#ifndef LONGLONG_STANDALONE
 #define udiv_qrnnd(q, r, n1, n0, d) \
  do { USItype __r;							\
    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
@ -1030,8 +1073,8 @@ extern USItype __udiv_qrnnd ();
  } while (0)
 extern USItype __udiv_qrnnd ();
 #define UDIV_TIME 140
-#endif /* __sparclite__ */
-#endif /* __sparc_v8__ */
+#endif /* LONGLONG_STANDALONE */
+#endif /* udiv_qrnnd */
 #endif /* __sparc__ */

 #if defined (__vax__) && W_TYPE_SIZE == 32
@ -1075,7 +1118,7 @@ extern USItype __udiv_qrnnd ();
    __xx.__i.__h = n1; __xx.__i.__l = n0;				\
    __asm__ ("ediv %3,%2,%0,%1"						\
 	     : "=g" (q), "=g" (r)					\
-	     : "g" (__n1n0.ll), "g" (d));				\
+	     : "g" (__xx.ll), "g" (d));					\
  } while (0)
 #endif /* __vax__ */

@ -1173,11 +1216,12 @@ extern USItype __udiv_qrnnd ();
  do {									\
    UWtype __x0, __x1, __x2, __x3;					\
    UHWtype __ul, __vl, __uh, __vh;					\
+    UWtype __u = (u), __v = (v);					\
 									\
-    __ul = __ll_lowpart (u);						\
-    __uh = __ll_highpart (u);						\
-    __vl = __ll_lowpart (v);						\
-    __vh = __ll_highpart (v);						\
+    __ul = __ll_lowpart (__u);						\
+    __uh = __ll_highpart (__u);						\
+    __vl = __ll_lowpart (__v);						\
+    __vh = __ll_highpart (__v);						\
 									\
    __x0 = (UWtype) __ul * __vl;					\
    __x1 = (UWtype) __ul * __vh;					\
@ -1194,6 +1238,17 @@ extern USItype __udiv_qrnnd ();
  } while (0)
 #endif

+#if !defined (umul_ppmm)
+#define smul_ppmm(w1, w0, u, v)						\
+  do {									\
+    UWtype __w1;							\
+    UWtype __m0 = (u), __m1 = (v);					\
+    umul_ppmm (__w1, w0, __m0, __m1);					\
+    (w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1)			\
+		- (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0);		\
+  } while (0)
+#endif
+
 /* Define this unconditionally, so it can be used for debugging.  */
 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
--- a/sysdeps/alpha/add_n.s
+++ b/sysdeps/alpha/add_n.s
@ -0,0 +1,119 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)
+	ldq	$4,0($18)
+
+	subq	$19,1,$19
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19
+
+.Loop0:	subq	$2,1,$2
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend
+
+	.align	3
+.Loop:	subq	$19,4,$19
+
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	ldq	$3,16($17)
+	addq	$6,$0,$6
+	ldq	$4,16($18)
+	cmpult	$6,$0,$1
+	addq	$5,$6,$6
+	cmpult	$6,$5,$0
+	stq	$6,8($16)
+	or	$0,$1,$0
+
+	ldq	$5,24($17)
+	addq	$4,$0,$4
+	ldq	$6,24($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,16($16)
+	or	$0,$1,$0
+
+	ldq	$3,32($17)
+	addq	$6,$0,$6
+	ldq	$4,32($18)
+	cmpult	$6,$0,$1
+	addq	$5,$6,$6
+	cmpult	$6,$5,$0
+	stq	$6,24($16)
+	or	$0,$1,$0
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+	ret	$31,($26),1
+
+	.end	__mpn_add_n
--- a/sysdeps/alpha/addmul_1.s
+++ b/sysdeps/alpha/addmul_1.s
@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # s2_limb	r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_addmul_1
+	.ent	__mpn_addmul_1 2
+__mpn_addmul_1:
+	.frame	$30,0,$26
+
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	umulh	$2,$19,$0	# $0 = prod_high
+	beq	$18,Lend1	# jump if size was == 1
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	addq	$5,$3,$3
+	cmpult	$3,$5,$4
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	beq	$18,Lend2	# jump if size was == 2
+
+	.align	3
+Loop:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	subq	$18,1,$18	# size--
+	umulh	$2,$19,$4	# $4 = cy_limb
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	addq	$5,$3,$3
+	cmpult	$3,$5,$5
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	addq	$5,$0,$0	# combine carries
+	bne	$18,Loop
+
+Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	umulh	$2,$19,$4	# $4 = cy_limb
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	addq	$5,$3,$3
+	cmpult	$3,$5,$5
+	stq	$3,0($16)
+	addq	$5,$0,$0	# combine carries
+	addq	$4,$0,$0	# cy_limb = prod_high + cy
+	ret	$31,($26),1
+Lend1:	addq	$5,$3,$3
+	cmpult	$3,$5,$5
+	stq	$3,0($16)
+	addq	$0,$5,$0
+	ret	$31,($26),1
+
+	.end	__mpn_addmul_1
--- a/sysdeps/alpha/alphaev5/add_n.s
+++ b/sysdeps/alpha/alphaev5/add_n.s
@ -0,0 +1,118 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)
+	ldq	$4,0($18)
+
+	subq	$19,1,$19
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19
+
+.Loop0:	subq	$2,1,$2
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend
+
+	.align	4
+.Loop:	subq	$19,4,$19
+	unop
+
+	ldq	$6,8($18)
+	addq	$4,$0,$0
+	ldq	$5,8($17)
+	cmpult	$0,$4,$1
+	ldq	$4,16($18)
+	addq	$3,$0,$20
+	cmpult	$20,$3,$0
+	ldq	$3,16($17)
+	or	$0,$1,$0
+	addq	$6,$0,$0
+	cmpult	$0,$6,$1
+	ldq	$6,24($18)
+	addq	$5,$0,$21
+	cmpult	$21,$5,$0
+	ldq	$5,24($17)
+	or	$0,$1,$0
+	addq	$4,$0,$0
+	cmpult	$0,$4,$1
+	ldq	$4,32($18)
+	addq	$3,$0,$22
+	cmpult	$22,$3,$0
+	ldq	$3,32($17)
+	or	$0,$1,$0
+	addq	$6,$0,$0
+	cmpult	$0,$6,$1
+	addq	$5,$0,$23
+	cmpult	$23,$5,$0
+	or	$0,$1,$0
+
+	stq	$20,0($16)
+	stq	$21,8($16)
+	stq	$22,16($16)
+	stq	$23,24($16)
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+	ret	$31,($26),1
+
+	.end	__mpn_add_n
--- a/sysdeps/alpha/alphaev5/lshift.s
+++ b/sysdeps/alpha/alphaev5/lshift.s
@ -0,0 +1,175 @@
+ # Alpha EV5 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.frame	$30,0,$26,0
+
+	s8addq	$18,$17,$17	# make r17 point at end of s1
+	ldq	$4,-8($17)	# load first limb
+	subq	$31,$19,$20
+	s8addq	$18,$16,$16	# make r16 point at end of RES
+	subq	$18,1,$18
+	and	$18,4-1,$28	# number of limbs in first loop
+	srl	$4,$20,$0	# compute function result
+
+	beq	$28,L0
+	subq	$18,$28,$18
+
+	.align	3
+Loop0:	ldq	$3,-16($17)
+	subq	$16,8,$16
+	sll	$4,$19,$5
+	subq	$17,8,$17
+	subq	$28,1,$28
+	srl	$3,$20,$6
+	or	$3,$3,$4
+	or	$5,$6,$8
+	stq	$8,0($16)
+	bne	$28,Loop0
+
+L0:	sll	$4,$19,$24
+	beq	$18,Lend
+ # warm up phase 1
+	ldq	$1,-16($17)
+	subq	$18,4,$18
+	ldq	$2,-24($17)
+	ldq	$3,-32($17)
+	ldq	$4,-40($17)
+	beq	$18,Lcool1
+ # warm up phase 2
+	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	ldq	$1,-48($17)
+	sll	$2,$19,$22
+	ldq	$2,-56($17)
+	srl	$3,$20,$5
+	or	$7,$24,$7
+	sll	$3,$19,$23
+	or	$8,$21,$8
+	srl	$4,$20,$6
+	ldq	$3,-64($17)
+	sll	$4,$19,$24
+	ldq	$4,-72($17)
+	subq	$18,4,$18
+	beq	$18,Lcool1
+	.align  4
+ # main loop
+Loop:	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+
+	srl	$1,$20,$7
+	subq	$18,4,$18
+	sll	$1,$19,$21
+	unop	# ldq	$31,-96($17)
+
+	srl	$2,$20,$8
+	ldq	$1,-80($17)
+	sll	$2,$19,$22
+	ldq	$2,-88($17)
+
+	stq	$5,-24($16)
+	or	$7,$24,$7
+	stq	$6,-32($16)
+	or	$8,$21,$8
+
+	srl	$3,$20,$5
+	unop	# ldq	$31,-96($17)
+	sll	$3,$19,$23
+	subq	$16,32,$16
+
+	srl	$4,$20,$6
+	ldq	$3,-96($17
+	sll	$4,$19,$24
+	ldq	$4,-104($17)
+
+	subq	$17,32,$17
+	bne	$18,Loop
+	unop
+	unop
+ # cool down phase 2/1
+Lcool1:	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	sll	$2,$19,$22
+	stq	$5,-24($16)
+	or	$7,$24,$7
+	stq	$6,-32($16)
+	or	$8,$21,$8
+	srl	$3,$20,$5
+	sll	$3,$19,$23
+	srl	$4,$20,$6
+	sll	$4,$19,$24
+ # cool down phase 2/2
+	stq	$7,-40($16)
+	or	$5,$22,$5
+	stq	$8,-48($16)
+	or	$6,$23,$6
+	stq	$5,-56($16)
+	stq	$6,-64($16)
+ # cool down phase 2/3
+	stq	$24,-72($16)
+	ret	$31,($26),1
+
+ # cool down phase 1/1
+Lcool1:	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	sll	$2,$19,$22
+	srl	$3,$20,$5
+	or	$7,$24,$7
+	sll	$3,$19,$23
+	or	$8,$21,$8
+	srl	$4,$20,$6
+	sll	$4,$19,$24
+ # cool down phase 1/2
+	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+	stq	$5,-24($16)
+	stq	$6,-32($16)
+	stq	$24,-40($16)
+	ret	$31,($26),1
+
+Lend	stq	$24,-8($16)
+	ret	$31,($26),1
+	.end	__mpn_lshift
--- a/sysdeps/alpha/alphaev5/rshift.s
+++ b/sysdeps/alpha/alphaev5/rshift.s
@ -0,0 +1,173 @@
+ # Alpha EV5 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.frame	$30,0,$26,0
+
+	ldq	$4,0($17)	# load first limb
+	subq	$31,$19,$20
+	subq	$18,1,$18
+	and	$18,4-1,$28	# number of limbs in first loop
+	sll	$4,$20,$0	# compute function result
+
+	beq	$28,L0
+	subq	$18,$28,$18
+
+	.align	3
+Loop0:	ldq	$3,8($17)
+	addq	$16,8,$16
+	srl	$4,$19,$5
+	addq	$17,8,$17
+	subq	$28,1,$28
+	sll	$3,$20,$6
+	or	$3,$3,$4
+	or	$5,$6,$8
+	stq	$8,-8($16)
+	bne	$28,Loop0
+
+L0:	srl	$4,$19,$24
+	beq	$18,Lend
+ # warm up phase 1
+	ldq	$1,8($17)
+	subq	$18,4,$18
+	ldq	$2,16($17)
+	ldq	$3,24($17)
+	ldq	$4,32($17)
+	beq	$18,Lcool1
+ # warm up phase 2
+	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	ldq	$1,40($17)
+	srl	$2,$19,$22
+	ldq	$2,48($17)
+	sll	$3,$20,$5
+	or	$7,$24,$7
+	srl	$3,$19,$23
+	or	$8,$21,$8
+	sll	$4,$20,$6
+	ldq	$3,56($17)
+	srl	$4,$19,$24
+	ldq	$4,64($17)
+	subq	$18,4,$18
+	beq	$18,Lcool2
+	.align  4
+ # main loop
+Loop:	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+
+	sll	$1,$20,$7
+	subq	$18,4,$18
+	srl	$1,$19,$21
+	unop	# ldq	$31,-96($17)
+
+	sll	$2,$20,$8
+	ldq	$1,72($17)
+	srl	$2,$19,$22
+	ldq	$2,80($17)
+
+	stq	$5,16($16)
+	or	$7,$24,$7
+	stq	$6,24($16)
+	or	$8,$21,$8
+
+	sll	$3,$20,$5
+	unop	# ldq	$31,-96($17)
+	srl	$3,$19,$23
+	addq	$16,32,$16
+
+	sll	$4,$20,$6
+	ldq	$3,88($17)
+	srl	$4,$19,$24
+	ldq	$4,96($17)
+
+	addq	$17,32,$17
+	bne	$18,Loop
+	unop
+	unop
+ # cool down phase 2/1
+Lcool2:	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	srl	$2,$19,$22
+	stq	$5,16($16)
+	or	$7,$24,$7
+	stq	$6,24($16)
+	or	$8,$21,$8
+	sll	$3,$20,$5
+	srl	$3,$19,$23
+	sll	$4,$20,$6
+	srl	$4,$19,$24
+ # cool down phase 2/2
+	stq	$7,32($16)
+	or	$5,$22,$5
+	stq	$8,40($16)
+	or	$6,$23,$6
+	stq	$5,48($16)
+	stq	$6,56($16)
+ # cool down phase 2/3
+	stq	$24,64($16)
+	ret	$31,($26),1
+
+ # cool down phase 1/1
+Lcool1:	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	srl	$2,$19,$22
+	sll	$3,$20,$5
+	or	$7,$24,$7
+	srl	$3,$19,$23
+	or	$8,$21,$8
+	sll	$4,$20,$6
+	srl	$4,$19,$24
+ # cool down phase 1/2
+	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+	stq	$5,16($16)
+	stq	$6,24($16)
+	stq	$24,32($16)
+	ret	$31,($26),1
+
+Lend:	stq	$24,0($16)
+	ret	$31,($26),1
+	.end	__mpn_rshift
--- a/sysdeps/alpha/lshift.s
+++ b/sysdeps/alpha/lshift.s
@ -0,0 +1,108 @@
+ # Alpha 21064 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.frame	$30,0,$26,0
+
+	s8addq	$18,$17,$17	# make r17 point at end of s1
+	ldq	$4,-8($17)	# load first limb
+	subq	$17,8,$17
+	subq	$31,$19,$7
+	s8addq	$18,$16,$16	# make r16 point at end of RES
+	subq	$18,1,$18
+	and	$18,4-1,$20	# number of limbs in first loop
+	srl	$4,$7,$0	# compute function result
+
+	beq	$20,L0
+	subq	$18,$20,$18
+
+	.align	3
+Loop0:
+	ldq	$3,-8($17)
+	subq	$16,8,$16
+	subq	$17,8,$17
+	subq	$20,1,$20
+	sll	$4,$19,$5
+	srl	$3,$7,$6
+	bis	$3,$3,$4
+	bis	$5,$6,$8
+	stq	$8,0($16)
+	bne	$20,Loop0
+
+L0:	beq	$18,Lend
+
+	.align	3
+Loop:	ldq	$3,-8($17)
+	subq	$16,32,$16
+	subq	$18,4,$18
+	sll	$4,$19,$5
+	srl	$3,$7,$6
+
+	ldq	$4,-16($17)
+	sll	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,24($16)
+	srl	$4,$7,$2
+
+	ldq	$3,-24($17)
+	sll	$4,$19,$5
+	bis	$1,$2,$8
+	stq	$8,16($16)
+	srl	$3,$7,$6
+
+	ldq	$4,-32($17)
+	sll	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,8($16)
+	srl	$4,$7,$2
+
+	subq	$17,32,$17
+	bis	$1,$2,$8
+	stq	$8,0($16)
+
+	bgt	$18,Loop
+
+Lend:	sll	$4,$19,$8
+	stq	$8,-8($16)
+	ret	$31,($26),1
+	.end	__mpn_lshift
--- a/sysdeps/alpha/mul_1.s
+++ b/sysdeps/alpha/mul_1.s
@ -0,0 +1,84 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # s2_limb	r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_mul_1
+	.ent	__mpn_mul_1 2
+__mpn_mul_1:
+	.frame	$30,0,$26
+
+	ldq	$2,0($17)	# $2 = s1_limb
+	subq	$18,1,$18	# size--
+	mulq	$2,$19,$3	# $3 = prod_low
+	bic	$31,$31,$4	# clear cy_limb
+	umulh	$2,$19,$0	# $0 = prod_high
+	beq	$18,Lend1	# jump if size was == 1
+	ldq	$2,8($17)	# $2 = s1_limb
+	subq	$18,1,$18	# size--
+	stq	$3,0($16)
+	beq	$18,Lend2	# jump if size was == 2
+
+	.align	3
+Loop:	mulq	$2,$19,$3	# $3 = prod_low
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	subq	$18,1,$18	# size--
+	umulh	$2,$19,$4	# $4 = cy_limb
+	ldq	$2,16($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	stq	$3,8($16)
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	addq	$16,8,$16	# res_ptr++
+	bne	$18,Loop
+
+Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	umulh	$2,$19,$4	# $4 = cy_limb
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	stq	$3,8($16)
+	addq	$4,$0,$0	# cy_limb = prod_high + cy
+	ret	$31,($26),1
+Lend1:	stq	$3,0($16)
+	ret	$31,($26),1
+
+	.end	__mpn_mul_1
--- a/sysdeps/alpha/rshift.s
+++ b/sysdeps/alpha/rshift.s
@ -0,0 +1,106 @@
+ # Alpha 21064 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+      
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.frame	$30,0,$26,0
+
+	ldq	$4,0($17)	# load first limb
+	addq	$17,8,$17
+	subq	$31,$19,$7
+	subq	$18,1,$18
+	and	$18,4-1,$20	# number of limbs in first loop
+	sll	$4,$7,$0	# compute function result
+
+	beq	$20,L0
+	subq	$18,$20,$18
+
+	.align	3
+Loop0:
+	ldq	$3,0($17)
+	addq	$16,8,$16
+	addq	$17,8,$17
+	subq	$20,1,$20
+	srl	$4,$19,$5
+	sll	$3,$7,$6
+	bis	$3,$3,$4
+	bis	$5,$6,$8
+	stq	$8,-8($16)
+	bne	$20,Loop0
+
+L0:	beq	$18,Lend
+
+	.align	3
+Loop:	ldq	$3,0($17)
+	addq	$16,32,$16
+	subq	$18,4,$18
+	srl	$4,$19,$5
+	sll	$3,$7,$6
+
+	ldq	$4,8($17)
+	srl	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,-32($16)
+	sll	$4,$7,$2
+
+	ldq	$3,16($17)
+	srl	$4,$19,$5
+	bis	$1,$2,$8
+	stq	$8,-24($16)
+	sll	$3,$7,$6
+
+	ldq	$4,24($17)
+	srl	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,-16($16)
+	sll	$4,$7,$2
+
+	addq	$17,32,$17
+	bis	$1,$2,$8
+	stq	$8,-8($16)
+
+	bgt	$18,Loop
+
+Lend:	srl	$4,$19,$8
+	stq	$8,0($16)
+	ret	$31,($26),1
+	.end	__mpn_rshift
--- a/sysdeps/alpha/sub_n.s
+++ b/sysdeps/alpha/sub_n.s
@ -0,0 +1,119 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)
+	ldq	$4,0($18)
+
+	subq	$19,1,$19
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19
+
+.Loop0:	subq	$2,1,$2
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend
+
+	.align	3
+.Loop:	subq	$19,4,$19
+
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	ldq	$3,16($17)
+	addq	$6,$0,$6
+	ldq	$4,16($18)
+	cmpult	$6,$0,$1
+	subq	$5,$6,$6
+	cmpult	$5,$6,$0
+	stq	$6,8($16)
+	or	$0,$1,$0
+
+	ldq	$5,24($17)
+	addq	$4,$0,$4
+	ldq	$6,24($18)
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,16($16)
+	or	$0,$1,$0
+
+	ldq	$3,32($17)
+	addq	$6,$0,$6
+	ldq	$4,32($18)
+	cmpult	$6,$0,$1
+	subq	$5,$6,$6
+	cmpult	$5,$6,$0
+	stq	$6,24($16)
+	or	$0,$1,$0
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+	ret	$31,($26),1
+
+	.end	__mpn_sub_n
--- a/sysdeps/alpha/submul_1.s
+++ b/sysdeps/alpha/submul_1.s
@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # s2_limb	r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_submul_1
+	.ent	__mpn_submul_1 2
+__mpn_submul_1:
+	.frame	$30,0,$26
+
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	umulh	$2,$19,$0	# $0 = prod_high
+	beq	$18,Lend1	# jump if size was == 1
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	subq	$5,$3,$3
+	cmpult	$5,$3,$4
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	beq	$18,Lend2	# jump if size was == 2
+
+	.align	3
+Loop:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	subq	$18,1,$18	# size--
+	umulh	$2,$19,$4	# $4 = cy_limb
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	subq	$5,$3,$3
+	cmpult	$5,$3,$5
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	addq	$5,$0,$0	# combine carries
+	bne	$18,Loop
+
+Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	umulh	$2,$19,$4	# $4 = cy_limb
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	subq	$5,$3,$3
+	cmpult	$5,$3,$5
+	stq	$3,0($16)
+	addq	$5,$0,$0	# combine carries
+	addq	$4,$0,$0	# cy_limb = prod_high + cy
+	ret	$31,($26),1
+Lend1:	subq	$5,$3,$3
+	cmpult	$5,$3,$5
+	stq	$3,0($16)
+	addq	$0,$5,$0
+	ret	$31,($26),1
+
+	.end	__mpn_submul_1
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@ -134,7 +134,7 @@ Loop2:	cmplt	n0,0,tmp
 	ret	$31,($26),1

 Odd:
-	/* q' in n0.  r' in n1.  */
+	/* q' in n0. r' in n1 */
 	addq	n1,n0,n1
 	cmpult	n1,n0,tmp	# tmp := carry from addq
 	beq	tmp,LLp6
--- a/sysdeps/generic/divmod_1.c
+++ b/sysdeps/generic/divmod_1.c
@ -83,14 +83,12 @@ __mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
 	     result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
 	     most significant bit (with weight 2**N) implicit.  */

-#if 0 /* This can't happen when normalization_steps != 0 */
 	  /* Special case for DIVISOR_LIMB == 100...000.  */
 	  if (divisor_limb << 1 == 0)
 	    divisor_limb_inverted = ~(mp_limb) 0;
 	  else
-#endif
-	  udiv_qrnnd (divisor_limb_inverted, dummy,
-		      -divisor_limb, 0, divisor_limb);
+	    udiv_qrnnd (divisor_limb_inverted, dummy,
+			-divisor_limb, 0, divisor_limb);

 	  n1 = dividend_ptr[dividend_size - 1];
 	  r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
--- a/sysdeps/generic/mod_1.c
+++ b/sysdeps/generic/mod_1.c
@ -3,8 +3,6 @@
   Return the single-limb remainder.
   There are no constraints on the value of the divisor.

-   QUOT_PTR and DIVIDEND_PTR might point to the same limb.
-
 Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.

 This file is part of the GNU MP Library.
--- a/sysdeps/hppa/add_n.s
+++ b/sysdeps/hppa/add_n.s
@ -0,0 +1,57 @@
+; HP-PA  __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+	.code
+	.export		__mpn_add_n
+__mpn_add_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,=		-1,%r23,L$end	; check for (SIZE == 1)
+	 add		%r20,%r19,%r28	; add first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,<>	-1,%r23,L$loop
+	 addc		%r20,%r19,%r28
+
+L$end	stws		%r28,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r0,%r28
+
+	.exit
+	.procend
--- a/sysdeps/hppa/hppa1.1/addmul_1.s
+++ b/sysdeps/hppa/hppa1.1/addmul_1.s
@ -0,0 +1,101 @@
+; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 11 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+	.code
+	.export		__mpn_addmul_1
+__mpn_addmul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1
+
+; Main loop
+L$loop	ldws		0(%r26),%r29
+	fldws,ma	4(%r25),%fr5
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addc		%r0,%r28,%r28
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1
+
+L$end	ldw		0(%r26),%r29
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1
+	ldw		-16(%r30),%r28
+	add		%r29,%r1,%r19
+	stw		%r19,0(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+	.exit
+	.procend
--- a/sysdeps/hppa/hppa1.1/mul_1.s
+++ b/sysdeps/hppa/hppa1.1/mul_1.s
@ -0,0 +1,97 @@
+; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 9 cycles/limb on a PA7000.  With the used instructions, it can
+; not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu.  Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively.  We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue.  If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early!  (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+	.code
+	.export		__mpn_mul_1
+__mpn_mul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds	 	%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1
+
+; Main loop
+L$loop	fldws,ma	4(%r25),%fr5
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1
+
+L$end	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	fstds		%fr6,-16(%r30)
+	ldw		-16(%r30),%r28
+	ldo		-64(%r30),%r30
+	bv		0(%r2)
+	 fstws		%fr6R,0(%r26)
+
+	.exit
+	.procend
--- a/sysdeps/hppa/hppa1.1/submul_1.s
+++ b/sysdeps/hppa/hppa1.1/submul_1.s
@ -0,0 +1,110 @@
+; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 12 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+; It seems possible to make this run as fast as __mpn_addmul_1, if we use
+; 	sub,>>=	%r29,%r19,%r22
+;	addi	1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+	.code
+	.export		__mpn_submul_1
+__mpn_submul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1
+
+; Main loop
+L$loop	ldws		0(%r26),%r29
+	fldws,ma	4(%r25),%fr5
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addc		%r0,%r28,%r28
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1
+
+L$end	ldw		0(%r26),%r29
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1
+	ldw		-16(%r30),%r28
+	sub		%r29,%r1,%r22
+	add		%r22,%r1,%r0
+	stw		%r22,0(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+	.exit
+	.procend
--- a/sysdeps/hppa/hppa1.1/udiv_qrnnd.s
+++ b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s
@ -0,0 +1,74 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr	gr26
+; n1		gr25
+; n0		gr24
+; d		gr23
+
+	.code
+L$0000	.word		0x43f00000
+	.word		0x0
+	.export		__udiv_qrnnd
+__udiv_qrnnd
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+	ldo		64(%r30),%r30
+
+	stws		%r25,-16(0,%r30)	; n_hi
+	stws		%r24,-12(0,%r30)	; n_lo
+	ldil		L'L$0000,%r19
+	ldo		R'L$0000(%r19),%r19
+	fldds		-16(0,%r30),%fr5
+	stws		%r23,-12(0,%r30)
+	comib,<=	0,%r25,L$1
+	fcnvxf,dbl,dbl	%fr5,%fr5
+	fldds		0(0,%r19),%fr4
+	fadd,dbl	%fr4,%fr5,%fr5
+L$1
+	fcpy,sgl	%fr0,%fr6L
+	fldws		-12(0,%r30),%fr6R
+	fcnvxf,dbl,dbl	%fr6,%fr4
+
+	fdiv,dbl	%fr5,%fr4,%fr5
+
+	fcnvfx,dbl,dbl	%fr5,%fr4
+	fstws		%fr4R,-16(%r30)
+	xmpyu		%fr4R,%fr6R,%fr6
+	ldws		-16(%r30),%r28
+	fstds		%fr6,-16(0,%r30)
+	ldws		-12(0,%r30),%r21
+	ldws		-16(0,%r30),%r20
+	sub		%r24,%r21,%r22
+	subb		%r25,%r20,%r19
+	comib,=		0,%r19,L$2
+	ldo		-64(%r30),%r30
+
+	add		%r22,%r23,%r22
+	ldo		-1(%r28),%r28
+L$2	bv		0(%r2)
+	stws		%r22,0(0,%r26)
+
+	.exit
+	.procend
--- a/sysdeps/hppa/lshift.s
+++ b/sysdeps/hppa/lshift.s
@ -0,0 +1,65 @@
+; HP-PA  __mpn_lshift --
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__mpn_lshift
+__mpn_lshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	sh2add		%r24,%r25,%r25
+	sh2add		%r24,%r26,%r26
+	ldws,mb		-4(0,%r25),%r22
+	subi		32,%r23,%r1
+	mtsar		%r1
+	addib,=		-1,%r24,L$0004
+	vshd		%r0,%r22,%r28		; compute carry out limb
+	ldws,mb		-4(0,%r25),%r29
+	addib,=		-1,%r24,L$0002
+	vshd		%r22,%r29,%r20
+
+L$loop	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	addib,=		-1,%r24,L$0003
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,<>	-1,%r24,L$loop
+	vshd		%r22,%r29,%r20
+
+L$0002	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+L$0003	stws,mb		%r20,-4(0,%r26)
+L$0004	vshd		%r22,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+
+	.exit
+	.procend
--- a/sysdeps/hppa/rshift.s
+++ b/sysdeps/hppa/rshift.s
@ -0,0 +1,62 @@
+; HP-PA  __mpn_rshift -- 
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__mpn_rshift
+__mpn_rshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r22
+	mtsar		%r23
+	addib,=		-1,%r24,L$0004
+	vshd		%r22,%r0,%r28		; compute carry out limb
+	ldws,ma		4(0,%r25),%r29
+	addib,=		-1,%r24,L$0002
+	vshd		%r29,%r22,%r20
+
+L$loop	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	addib,=		-1,%r24,L$0003
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,<>	-1,%r24,L$loop
+	vshd		%r29,%r22,%r20
+
+L$0002	stws,ma		%r20,4(0,%r26)
+	vshd		%r0,%r29,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+L$0003	stws,ma		%r20,4(0,%r26)
+L$0004	vshd		%r0,%r22,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+
+	.exit
+	.procend
--- a/sysdeps/hppa/sub_n.s
+++ b/sysdeps/hppa/sub_n.s
@ -0,0 +1,58 @@
+; HP-PA  __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+	.code
+	.export		__mpn_sub_n
+__mpn_sub_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,=		-1,%r23,L$end	; check for (SIZE == 1)
+	 sub		%r20,%r19,%r28	; subtract first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,<>	-1,%r23,L$loop
+	 subb		%r20,%r19,%r28
+
+L$end	stws		%r28,0(0,%r26)
+	addc		%r0,%r0,%r28
+	bv		0(%r2)
+	 subi		1,%r28,%r28
+
+	.exit
+	.procend
--- a/sysdeps/hppa/udiv_qrnnd.s
+++ b/sysdeps/hppa/udiv_qrnnd.s
@ -0,0 +1,285 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr	gr26
+; n1		gr25
+; n0		gr24
+; d		gr23
+
+; The code size is a bit excessive.  We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down.  The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+	.code
+	.export		__udiv_qrnnd
+__udiv_qrnnd
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	comb,<		%r23,0,L$largedivisor
+	 sub		%r0,%r23,%r1		; clear cy as side-effect
+	ds		%r0,%r1,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r28
+	ds		%r25,%r23,%r25
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r23,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r28,%r28,%r28
+
+L$largedivisor
+	extru		%r24,31,1,%r19		; r19 = n0 & 1
+	bb,<		%r23,31,L$odd
+	 extru		%r23,30,31,%r22		; r22 = d >> 1
+	shd		%r25,%r24,1,%r24	; r24 = new n0
+	extru		%r25,30,31,%r25		; r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r22,%r25
+	sh1addl		%r25,%r19,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r24,%r24,%r28
+
+L$odd	addib,sv,n	1,%r22,L$FF..		; r22 = (d / 2 + 1)
+	shd		%r25,%r24,1,%r24	; r24 = new n0
+	extru		%r25,30,31,%r25		; r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r28
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r22,%r25
+	sh1addl		%r25,%r19,%r25
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+	add,nuv		%r28,%r25,%r25
+	addl		%r25,%r1,%r25
+	addc		%r0,%r28,%r28
+	sub,<<		%r25,%r23,%r0
+	addl		%r25,%r1,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r28,%r28
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF..	add,uv		%r25,%r24,%r24
+	sub,<<		%r24,%r23,%r0
+	ldo		1(%r24),%r24
+	stws		%r24,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r25,%r28
+
+	.exit
+	.procend
--- a/sysdeps/i386/add_n.S
+++ b/sysdeps/i386/add_n.S
@ -1,7 +1,7 @@
 /* i80386 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
 sum in a third limb vector.

-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_add_n:)
 	subl	%eax,%edx		/* ... enter the loop */
 	shrl	$2,%eax			/* restore previous value */
 #ifdef PIC
-	call	here
-here:	leal	(Loop - 3 - here)(%eax,%eax,8),%eax
-	addl	%eax,(%esp)
-	ret
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+	call	L0
+L0:	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(Loop-L0-3),%eax 
+	addl	$4,%esp
 #else
-	leal	(Loop - 3)(%eax,%eax,8),%eax	/* calc start addr in loop */
-	jmp	*%eax			/* jump into loop */
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(Loop - 3)(%eax,%eax,8),%eax
 #endif
+	jmp	*%eax			/* jump into loop */
 	ALIGN (3)
 Loop:	movl	(%esi),%eax
 	adcl	(%edx),%eax
--- a/sysdeps/i386/gmp-mparam.h
+++ b/sysdeps/i386/gmp-mparam.h
@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0
--- a/sysdeps/i386/i486/strcat.S
+++ b/sysdeps/i386/i486/strcat.S
@ -0,0 +1,260 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+For Intel 80x86, x>=4.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest		(sp + 4)
+   src		(sp + 8)
+*/
+
+	.text
+ENTRY (strcat)
+	pushl %edi		/* Save callee-safe register.  */
+
+	movl 12(%esp), %ecx	/* load source pointer */
+	movl 8(%esp), %edx	/* load destination pointer */
+
+	testb $0xff, (%ecx)	/* Is source string empty? */
+	jz L8			/* yes => return */
+
+	/* Test the first bytes separately until destination is aligned.  */
+	testb $3, %edx		/* destination pointer aligned? */
+	jz L1			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L2			/* yes => start appending */
+	incl %edx		/* increment source pointer */
+
+	testb $3, %edx		/* destination pointer aligned? */
+	jz L1			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L2			/* yes => start appending */
+	incl %edx		/* increment source pointer */
+
+	testb $3, %edx		/* destination pointer aligned? */
+	jz L1			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L2			/* yes => start appending */
+	incl %edx		/* increment source pointer */
+
+	/* Now we are aligned.  Begin scan loop.  */
+	jmp L1
+
+	ALIGN(4)
+
+L4:	addl $16,%edx		/* increment destination pointer for round */
+
+L1:	movl (%edx), %eax	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+
+	/* If you compare this with the algorithm in memchr.S you will
+	   notice that here is an `xorl' statement missing.  But you must
+	   not forget that we are looking for C == 0 and `xorl $0, %eax'
+	   is a no-op.  */
+
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter then last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occured in the last byte => it was 0.	*/
+	jnc L3
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %ecx.  */
+	jnz L3
+
+	movl 4(%edx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L5			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L5			/* one byte is NUL => stop copying */
+
+	movl 8(%edx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L6			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L6			/* one byte is NUL => stop copying */
+
+	movl 12(%edx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L7			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L4			/* no byte is NUL => carry on copying */
+
+L7:	addl $4, %edx		/* adjust source pointer */
+L6:	addl $4, %edx
+L5:	addl $4, %edx
+
+L3:	testb %al, %al		/* is first byte NUL? */
+	jz L2			/* yes => start copying */
+	incl %edx		/* increment source pointer */
+
+	testb %ah, %ah		/* is second byte NUL? */
+	jz L2			/* yes => start copying */
+	incl %edx		/* increment source pointer */
+
+	testl $0xff0000, %eax	/* is third byte NUL? */
+	jz L2			/* yes => start copying */
+	incl %edx		/* increment source pointer */
+
+L2:	subl %ecx, %edx		/* reduce number of loop variants */
+
+	/* Now we have to align the source pointer.  */
+	testb $3, %ecx		/* pointer correctly aligned? */
+	jz L29			/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andl %al, %al		/* is byte NUL? */
+	jz L8			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testb $3, %ecx		/* pointer correctly aligned? */
+	jz L29			/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andl %al, %al		/* is byte NUL? */
+	jz L8			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testb $3, %ecx		/* pointer correctly aligned? */
+	jz L29			/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andl %al, %al		/* is byte NUL? */
+	jz L8			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	/* Now we are aligned.  */
+	jmp L29			/* start copy loop */
+
+	ALIGN(4)
+
+L28:	movl %eax, 12(%ecx,%edx)/* store word at destination */
+	addl $16, %ecx		/* adjust pointer for full round */
+
+L29:	movl (%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L9			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L9			/* one byte is NUL => stop copying */
+	movl %eax, (%ecx,%edx)	/* store word to destination */
+
+	movl 4(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L91			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L91			/* one byte is NUL => stop copying */
+	movl %eax, 4(%ecx,%edx)	/* store word to destination */
+
+	movl 8(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L92			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L92			/* one byte is NUL => stop copying */
+	movl %eax, 8(%ecx,%edx)	/* store word to destination */
+
+	movl 12(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L93			/* highest byte is C => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L28			/* no is NUL => carry on copying */
+
+L93:	addl $4, %ecx		/* adjust pointer */
+L92:	addl $4, %ecx
+L91:	addl $4, %ecx
+
+L9:	movb %al, (%ecx,%edx)	/* store first byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L8			/* yes => return */
+
+	movb %ah, 1(%ecx,%edx)	/* store second byte of last word */
+	orb %ah, %ah		/* is it NUL? */
+	jz L8			/* yes => return */
+
+	shrl $16, %eax		/* make upper bytes accessible */
+	movb %al, 2(%ecx,%edx)	/* store third byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L8			/* yes => return */
+
+	movb %ah, 3(%ecx,%edx)	/* store fourth byte of last word */
+
+L8:	movl 8(%esp), %eax	/* start address of destination is result */
+	popl %edi		/* restore saved register */
+
+	ret
--- a/sysdeps/i386/i486/strlen.S
+++ b/sysdeps/i386/i486/strlen.S
@ -0,0 +1,132 @@
+/* strlen(str) -- determine the length of the string STR.
+Optimized for Intel 80x86, x>=4.
+Copyright (C) 1991, 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+*/
+
+	.text
+ENTRY (strlen)
+	movl 4(%esp), %ecx	/* get string pointer */
+	movl %ecx, %eax		/* duplicate it */
+
+	andl $3, %ecx		/* mask alignment bits */
+	jz L1			/* aligned => start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	xorl $3, %ecx		/* was alignment = 3? */
+	jz L1			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+	addl $1, %eax		/* increment pointer */
+
+	subl $1, %ecx		/* was alignment = 2? */
+	jz L1			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax'
+   and `decl %ecx' resp.  The additional two byte per instruction make the
+   label 4 to be aligned on a 16 byte boundary with nops.
+
+   The following `sub $15, %eax' is part of this trick, too.  Together with
+   the next instruction (`addl $16, %eax') it is in fact a `incl %eax', just
+   as expected from the algorithm.  But doing so has the advantage that
+   no jump to label 1 is necessary and so the pipeline is not flushed.  */
+
+	subl $15, %eax		/* effectively +1 */
+
+
+L4:	addl $16, %eax		/* adjust pointer for full loop */
+
+L1:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L3			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L3			/* found NUL => return pointer */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L5			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L5			/* found NUL => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L6			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L6			/* found NUL => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L7			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L4			/* no NUL found => continue loop */
+
+L7:	addl $4, %eax		/* adjust pointer */
+L6:	addl $4, %eax
+L5:	addl $4, %eax
+
+L3:	testb %cl, %cl		/* is first byte NUL? */
+	jz L2			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz L2			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testl $0xff0000, %ecx	/* is third byte NUL? */
+	jz L2			/* yes => return pointer */
+	incl %eax		/* increment pointer */
+
+L2:	subl 4(%esp), %eax	/* compute difference to string start */
+
+	ret
--- a/sysdeps/i386/i586/Implies
+++ b/sysdeps/i386/i586/Implies
@ -0,0 +1,2 @@
+# Code optimized for i486 is better than simple i386 code.
+i386/i486
--- a/sysdeps/i386/i586/add_n.S
+++ b/sysdeps/i386/i586/add_n.S
@ -0,0 +1,136 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   s2_ptr	(sp + 12)
+   size		(sp + 16)
+*/
+
+#define r1	%eax
+#define r2	%edx
+#define src1	%esi
+#define src2	%ebp
+#define dst	%edi
+#define x	%ebx
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_add_n)
+C_SYMBOL_NAME(__mpn_add_n:)
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),dst		/* res_ptr */
+	movl	24(%esp),src1		/* s1_ptr */
+	movl	28(%esp),src2		/* s2_ptr */
+	movl	32(%esp),%ecx		/* size */
+
+	movl	(src2),x
+
+	decl	%ecx
+	movl	%ecx,r2
+	shrl	$3,%ecx
+	andl	$7,r2
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	Lend
+	pushl	r2
+
+	ALIGN (3)
+Loop:	movl	28(dst),%eax		/* fetch destination cache line */
+	leal	32(dst),dst
+
+L1:	movl	(src1),r1
+	movl	4(src1),r2
+	adcl	x,r1
+	movl	4(src2),x
+	adcl	x,r2
+	movl	8(src2),x
+	movl	r1,-32(dst)
+	movl	r2,-28(dst)
+
+L2:	movl	8(src1),r1
+	movl	12(src1),r2
+	adcl	x,r1
+	movl	12(src2),x
+	adcl	x,r2
+	movl	16(src2),x
+	movl	r1,-24(dst)
+	movl	r2,-20(dst)
+
+L3:	movl	16(src1),r1
+	movl	20(src1),r2
+	adcl	x,r1
+	movl	20(src2),x
+	adcl	x,r2
+	movl	24(src2),x
+	movl	r1,-16(dst)
+	movl	r2,-12(dst)
+
+L4:	movl	24(src1),r1
+	movl	28(src1),r2
+	adcl	x,r1
+	movl	28(src2),x
+	adcl	x,r2
+	movl	32(src2),x
+	movl	r1,-8(dst)
+	movl	r2,-4(dst)
+
+	leal	32(src1),src1
+	leal	32(src2),src2
+	decl	%ecx
+	jnz	Loop
+
+	popl	r2
+Lend:
+	decl	r2			/* test r2 w/o clobbering carry */
+	js	Lend2
+	incl	r2
+Loop2:
+	leal	4(dst),dst
+	movl	(src1),r1
+	adcl	x,r1
+	movl	4(src2),x
+	movl	r1,-4(dst)
+	leal	4(src1),src1
+	leal	4(src2),src2
+	decl	r2
+	jnz	Loop2
+Lend2:
+	movl	(src1),r1
+	adcl	x,r1
+	movl	r1,(dst)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
--- a/sysdeps/i386/i586/addmul_1.S
+++ b/sysdeps/i386/i586/addmul_1.S
@ -0,0 +1,84 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   size		(sp + 12)
+   s2_limb	(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define s2_limb ebp
+
+	TEXT
+	ALIGN (3)
+	GLOBL	C_SYMBOL_NAME(__mpn_addmul_1)
+	.type	C_SYMBOL_NAME(__mpn_addmul_1),@function
+C_SYMBOL_NAME(__mpn_addmul_1:)
+
+	INSN1(push,l	,R(edi))
+	INSN1(push,l	,R(esi))
+	INSN1(push,l	,R(ebx))
+	INSN1(push,l	,R(ebp))
+
+	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))
+	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))
+	INSN2(mov,l	,R(ecx),MEM_DISP(esp,28))
+	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))
+
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,ecx,4))
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4))
+	INSN1(neg,l	,R(ecx))
+	INSN2(xor,l	,R(edx),R(edx))
+	ALIGN (3)
+Loop:
+	INSN2(mov,l	,R(ebx),R(edx))
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,ecx,4))
+
+	INSN1(mul,l	,R(s2_limb))
+
+	INSN2(add,l	,R(eax),R(ebx))
+	INSN2(mov,l	,R(ebx),MEM_INDEX(res_ptr,ecx,4))
+
+	INSN2(adc,l	,R(edx),$0)
+	INSN2(add,l	,R(ebx),R(eax))
+
+	INSN2(adc,l	,R(edx),$0)
+	INSN2(mov,l	,MEM_INDEX(res_ptr,ecx,4),R(ebx))
+
+	INSN1(inc,l	,R(ecx))
+	INSN1(jnz,	,Loop)
+
+
+	INSN2(mov,l	,R(eax),R(edx))
+	INSN1(pop,l	,R(ebp))
+	INSN1(pop,l	,R(ebx))
+	INSN1(pop,l	,R(esi))
+	INSN1(pop,l	,R(edi))
+	ret
+Lfe1:
+	.size	C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1)
--- a/sysdeps/i386/i586/lshift.S
+++ b/sysdeps/i386/i586/lshift.S
@ -0,0 +1,213 @@
+/* Pentium optimized __mpn_lshift -- 
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr	(sp + 4)
+  s_ptr		(sp + 8)
+  size		(sp + 12)
+  cnt		(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_lshift)
+C_SYMBOL_NAME(__mpn_lshift:)
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s_ptr */
+	movl	28(%esp),%ebp		/* size */
+	movl	32(%esp),%ecx		/* cnt */
+
+	cmp	$1,%ecx
+	jne	Lnormal
+	movl	%edi,%eax
+	subl	%esi,%eax
+	cmpl	%ebp,%eax
+	jnc	Lspecial
+
+Lnormal:
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	Lend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+Loop:	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl	%cl,%eax,%ebx
+	shldl	%cl,%edx,%eax
+	movl	%ebx,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	shldl	%cl,%ebx,%edx
+	shldl	%cl,%eax,%ebx
+	movl	%edx,-8(%edi)
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	shldl	%cl,%edx,%eax
+	shldl	%cl,%ebx,%edx
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl	%cl,%eax,%ebx
+	shldl	%cl,%edx,%eax
+	movl	%ebx,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebp
+	jnz	Loop
+
+Lend:	popl	%ebp
+	andl	$7,%ebp
+	jz	Lend2
+Loop2:	movl	(%esi),%eax
+	shldl	%cl,%eax,%edx
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebp
+	jnz	Loop2
+
+Lend2:	shll	%cl,%edx		/* compute least significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+/* We loop from least significant end of the arrays, which is only
+   permissable if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.
+*/
+
+Lspecial:
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	addl	%edx,%edx
+	incl	%ebp
+	decl	%ebp
+	jz	LLend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+LLoop:	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	adcl	%ebx,%ebx
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebx,%ebx
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		/* use leal not to clobber carry */
+	leal	32(%edi),%edi
+	decl	%ebp
+	jnz	LLoop
+
+LLend:	popl	%ebp
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebp
+	jz	LLend2
+	addl	%eax,%eax		/* restore carry from eax */
+LLoop2:	movl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebx,(%edi)
+
+	leal	4(%esi),%esi		/* use leal not to clobber carry */
+	leal	4(%edi),%edi
+	decl	%ebp
+	jnz	LLoop2
+
+	jmp	LL1
+LLend2:	addl	%eax,%eax		/* restore carry from eax */
+LL1:	movl	%edx,(%edi)		/* store last limb */
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
--- a/sysdeps/i386/i586/memcopy.h
+++ b/sysdeps/i386/i586/memcopy.h
@ -1,5 +1,5 @@
 /* memcopy.h -- definitions for memory copy functions.  Pentium version.
-   Copyright (C) 1994 Free Software Foundation, Inc.
+   Copyright (C) 1994, 1995 Free Software Foundation, Inc.
   Contributed by Torbjorn Granlund (tege@sics.se).

 This file is part of the GNU C Library.
@ -88,7 +88,7 @@ Cambridge, MA 02139, USA.  */
 		    "subl	$32,%2\n"				\
 		    "jns	1b\n"					\
 		    "2: addl	$32,%2" :				\
-		    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :	\
-		    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :		\
+		    "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) :	\
+		    "0" (dst_ep), "1" (src_ep), "2" (nbytes) :		\
 		    "ax", "dx");					\
    } while (0)
--- a/sysdeps/i386/i586/mul_1.S
+++ b/sysdeps/i386/i586/mul_1.S
@ -0,0 +1,78 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   size		(sp + 12)
+   s2_limb	(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebp
+
+	TEXT
+	ALIGN (3)
+	GLOBL	C_SYMBOL_NAME(__mpn_mul_1)
+C_SYMBOL_NAME(__mpn_mul_1:)
+
+	INSN1(push,l	,R(edi))
+	INSN1(push,l	,R(esi))
+	INSN1(push,l	,R(ebx))
+	INSN1(push,l	,R(ebp))
+
+	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))
+	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))
+	INSN2(mov,l	,R(size),MEM_DISP(esp,28))
+	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))
+
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
+	INSN1(neg,l	,R(size))
+	INSN2(xor,l	,R(edx),R(edx))
+	ALIGN (3)
+Loop:
+	INSN2(mov,l	,R(ebx),R(edx))
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))
+
+	INSN1(mul,l	,R(s2_limb))
+
+	INSN2(add,l	,R(eax),R(ebx))
+
+	INSN2(adc,l	,R(edx),$0)
+	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(eax))
+
+	INSN1(inc,l	,R(size))
+	INSN1(jnz,	,Loop)
+
+
+	INSN2(mov,l	,R(eax),R(edx))
+	INSN1(pop,l	,R(ebp))
+	INSN1(pop,l	,R(ebx))
+	INSN1(pop,l	,R(esi))
+	INSN1(pop,l	,R(edi))
+	ret
--- a/sysdeps/i386/i586/rshift.S
+++ b/sysdeps/i386/i586/rshift.S
@ -0,0 +1,213 @@
+/* Pentium optimized __mpn_rshift -- 
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr	(sp + 4)
+  s_ptr		(sp + 8)
+  size		(sp + 12)
+  cnt		(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_rshift)
+C_SYMBOL_NAME(__mpn_rshift:)
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s_ptr */
+	movl	28(%esp),%ebp		/* size */
+	movl	32(%esp),%ecx		/* cnt */
+
+	cmp	$1,%ecx
+	jne	Lnormal
+	movl	%edi,%eax
+	subl	%esi,%eax
+	cmpl	%ebp,%eax
+	jnc	Lspecial
+
+Lnormal:
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	Lend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+Loop:	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl	%cl,%eax,%ebx
+	shrdl	%cl,%edx,%eax
+	movl	%ebx,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	shrdl	%cl,%ebx,%edx
+	shrdl	%cl,%eax,%ebx
+	movl	%edx,8(%edi)
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	shrdl	%cl,%edx,%eax
+	shrdl	%cl,%ebx,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl	%cl,%eax,%ebx
+	shrdl	%cl,%edx,%eax
+	movl	%ebx,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebp
+	jnz	Loop
+
+Lend:	popl	%ebp
+	andl	$7,%ebp
+	jz	Lend2
+Loop2:	movl	(%esi),%eax
+	shrdl	%cl,%eax,%edx		/* compute result limb */
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebp
+	jnz	Loop2
+
+Lend2:	shrl	%cl,%edx		/* compute most significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+/* We loop from least significant end of the arrays, which is only
+   permissable if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.
+*/
+
+Lspecial:
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	shrl	$1,%edx
+	incl	%ebp
+	decl	%ebp
+	jz	LLend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+LLoop:	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebx,(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	rcrl	$1,%ebx
+	movl	%edx,-8(%edi)
+	rcrl	$1,%eax
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	rcrl	$1,%edx
+	movl	%eax,-16(%edi)
+	rcrl	$1,%ebx
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebx,-24(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		/* use leal not to clobber carry */
+	leal	-32(%edi),%edi
+	decl	%ebp
+	jnz	LLoop
+
+LLend:	popl	%ebp
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebp
+	jz	LLend2
+	addl	%eax,%eax		/* restore carry from eax */
+LLoop2:	movl	%edx,%ebx
+	movl	(%esi),%edx
+	rcrl	$1,%edx
+	movl	%ebx,(%edi)
+
+	leal	-4(%esi),%esi		/* use leal not to clobber carry */
+	leal	-4(%edi),%edi
+	decl	%ebp
+	jnz	LLoop2
+
+	jmp	LL1
+LLend2:	addl	%eax,%eax		/* restore carry from eax */
+LL1:	movl	%edx,(%edi)		/* store last limb */
+
+	movl	$0,%eax
+	rcrl	$1,%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
--- a/sysdeps/i386/i586/strchr.S
+++ b/sysdeps/i386/i586/strchr.S
@ -0,0 +1,334 @@
+/* strchr -- find character CH in a NUL terminated string.
+Highly optimized version for ix85, x>=5.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to executs some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   ch		(sp + 8)
+*/
+
+	.text
+ENTRY (strchr)
+	pushl %edi		/* Save callee-safe registers.  */
+	pushl %esi
+
+	pushl %ebx
+	pushl %ebp
+
+	movl 20(%esp), %eax	/* get string pointer */
+	movl 24(%esp), %edx	/* get character we are looking for */
+
+	movl %eax, %edi		/* duplicate string pointer for later */
+	xorl %ecx, %ecx		/* clear %ecx */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require an
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movb %dl, %cl		/* we construct the lower half in %ecx */
+
+	shll $16, %edx		/* now %edx is c|c|0|0 */
+	movb %cl, %ch		/* now %ecx is 0|0|c|c */
+
+	orl %ecx, %edx		/* and finally c|c|c|c */
+	andl $3, %edi		/* mask alignment bits */
+
+	jz L11			/* alignment is 0 => start loop */
+
+	movb (%eax), %cl	/* load single byte */
+	cmpb %cl, %dl		/* is byte == C? */
+
+	je L2			/* aligned => return pointer */
+
+	cmp $0, %cl		/* is byte NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+	cmp $3, %edi		/* was alignment == 3? */
+
+	je L11			/* yes => start loop */
+
+	movb (%eax), %cl	/* load single byte */
+	cmpb %cl, %dl		/* is byte == C? */
+
+	je L2			/* aligned => return pointer */
+
+	cmp $0, %cl		/* is byte NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+	cmp $2, %edi		/* was alignment == 2? */
+
+	je L11			/* yes => start loop */
+
+	movb (%eax), %cl	/* load single byte */
+	cmpb %cl, %dl		/* is byte == C? */
+
+	je L2			/* aligned => return pointer */
+
+	cmp $0, %cl		/* is byte NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	/* The following code is the preparation for the loop.  The
+	   four instruction up to `L1' will not be executed in the loop
+	   because the same code is found at the end of the loop, but
+	   there it is executed in parallel with other instructions.  */
+L11:	movl (%eax), %ecx
+	movl $magic, %ebp
+
+	movl $magic, %edi
+	addl %ecx, %ebp
+
+	/* The main loop: it looks complex and indeed it is.  I would
+	   love to say `it was hard to write, so it should he hard to
+	   read' but I will give some more hints.  To fully understand
+	   this code you should first take a look at the i486 version.
+	   The basic algorithm is the same, but here the code organized
+	   in a way which permits to use both pipelines all the time.
+
+	   I tried to make it a bit more understandable by indenting
+	   the code according to stage in the algorithm.  It goes as
+	   follows:
+		check for 0 in 1st word
+			check for C in 1st word
+					check for 0 in 2nd word
+						check for C in 2nd word
+		check for 0 in 3rd word
+			check for C in 3rd word
+					check for 0 in 4th word
+						check for C in 4th word
+
+	   Please note that doing the test for NUL before the test for
+	   C allows us to overlap the test for 0 in the next word with
+	   the test for C.  */
+
+L1:	xorl %ecx, %ebp			/* (word^magic) */
+	addl %ecx, %edi			/* add magic word */
+
+	leal 4(%eax), %eax		/* increment pointer */
+	jnc L4				/* previous addl caused overflow? */
+
+		movl %ecx, %ebx		/* duplicate original word */
+	orl $magic, %ebp		/* (word^magic)|magic */
+
+	addl $1, %ebp			/* (word^magic)|magic == 0xffffffff? */
+	jne L4				/* yes => we found word with NUL */
+
+		movl $magic, %esi	/* load magic value */
+		xorl %edx, %ebx		/* clear words which are C */
+
+					movl (%eax), %ecx
+		addl %ebx, %esi		/* (word+magic) */
+
+					movl $magic, %edi
+		jnc L5			/* previous addl caused overflow? */
+
+					movl %edi, %ebp
+		xorl %ebx, %esi		/* (word+magic)^word */
+
+					addl %ecx, %ebp
+		orl $magic, %esi	/* ((word+magic)^word)|magic */
+
+		addl $1, %esi		/* ((word+magic)^word)|magic==0xf..f?*/
+		jne L5			/* yes => we found word with C */
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L4
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L4
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L5
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+						jne L5
+
+	xorl %ecx, %ebp
+	addl %ecx, %edi
+
+	leal 4(%eax), %eax
+	jnc L4
+
+		movl %ecx, %ebx
+	orl $magic, %ebp
+
+	addl $1, %ebp
+	jne L4
+
+		movl $magic, %esi
+		xorl %edx, %ebx
+
+					movl (%eax), %ecx
+		addl %ebx, %esi
+
+					movl $magic, %edi
+		jnc L5
+
+					movl %edi, %ebp
+		xorl %ebx, %esi
+
+					addl %ecx, %ebp
+		orl $magic, %esi
+
+		addl $1, %esi
+		jne L5
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L4
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L4
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L5
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+
+						je L1
+
+	/* We know there is no NUL byte but a C byte in the word.
+	   %ebx contains NUL in this particular byte.  */
+L5:	subl $4, %eax		/* adjust pointer */
+	testb %bl, %bl		/* first byte == C? */
+
+	jz L2			/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+	testb %bh, %bh		/* second byte == C? */
+
+	jz L2			/* yes => return pointer */
+
+	shrl $16, %ebx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmp $0, %bl		/* third byte == C */
+	je L2			/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+
+L2:	popl %ebp		/* restore saved registers */
+	popl %ebx
+
+	popl %esi
+	popl %edi
+
+	ret
+
+	/* We know there is a NUL byte in the word.  But we have to test
+	   whether there is an C byte before it in the word.  */
+L4:	subl $4, %eax		/* adjust pointer */
+	cmpb %dl, %cl		/* first byte == C? */
+
+	je L2			/* yes => return pointer */
+
+	cmpb $0, %cl		/* first byte == NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %ch		/* second byte == C? */
+	je L2			/* yes => return pointer */
+
+	cmpb $0, %ch		/* second byte == NUL? */
+	je L3			/* yes => return NULL */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %cl		/* third byte == C? */
+	je L2			/* yes => return pointer */
+
+	cmpb $0, %cl		/* third byte == NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	/* The test four the fourth byte is necessary!  */
+	cmpb %dl, %ch		/* fourth byte == C? */
+	je L2			/* yes => return pointer */
+
+L3:	xorl %eax, %eax		/* set return value = NULL */
+
+	popl %ebp		/* restore saved registers */
+	popl %ebx
+
+	popl %esi
+	popl %edi
+
+	ret
+
+#undef index
+weak_alias (strchr, index)
--- a/sysdeps/i386/i586/strlen.S
+++ b/sysdeps/i386/i586/strlen.S
@ -0,0 +1,185 @@
+/* strlen -- Compute length og NUL terminated string.
+Highly optimized version for ix86, x>=5.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to executs some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+*/
+
+	.text
+ENTRY(strlen)
+	movl 4(%esp), %eax	/* get string pointer */
+
+	movl %eax, %ecx		/* duplicate it */
+	andl $3, %ecx		/* mask alignment bits */
+
+	jz L11			/* aligned => start loop */
+
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpl $3, %ecx		/* was alignment = 3? */
+
+	je L11			/* yes => now it is aligned and start loop */
+
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpl $2, %ecx		/* was alignment = 2? */
+
+	je L11			/* yes => now it is aligned and start loop */
+
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.  */
+L11:	xorl %edx, %edx		/* We need %edx == 0 for later */
+
+L1:
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+	subl $magic, %ecx	/* undo previous addl to restore word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L3			/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+	subl $magic, %ecx	/* undo previous addl to restore word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L3			/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+	subl $magic, %ecx	/* undo previous addl to restore word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	jne L3			/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* wcomplete negation of ord */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^word */
+	subl $magic, %ecx	/* undo previous addl to restore word */
+
+	andl $~magic, %edx	/* any of the carry flags set? */
+
+	je L1			/* no => start loop again */
+
+
+L3:	subl $4, %eax		/* correct too early pointer increment */
+	testb %cl, %cl		/* lowest byte NUL? */
+
+	jz L2			/* yes => return */
+
+	inc %eax		/* increment pointer */
+	testb %ch, %ch		/* second byte NUL? */
+
+	jz L2			/* yes => return */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb $0, %cl		/* is third byte NUL? */
+	jz L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+
+L2:	subl 4(%esp), %eax	/* now compute the length as difference
+				   between start and terminating NUL
+				   character */
+
+	ret
--- a/sysdeps/i386/i586/sub_n.S
+++ b/sysdeps/i386/i586/sub_n.S
@ -0,0 +1,136 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+   and store difference in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   s2_ptr	(sp + 12)
+   size		(sp + 16)
+*/
+
+#define r1	%eax
+#define r2	%edx
+#define src1	%esi
+#define src2	%ebp
+#define dst	%edi
+#define x	%ebx
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_sub_n)
+C_SYMBOL_NAME(__mpn_sub_n:)
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),dst		/* res_ptr */
+	movl	24(%esp),src1		/* s1_ptr */
+	movl	28(%esp),src2		/* s2_ptr */
+	movl	32(%esp),%ecx		/* size */
+
+	movl	(src2),x
+
+	decl	%ecx
+	movl	%ecx,r2
+	shrl	$3,%ecx
+	andl	$7,r2
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	Lend
+	pushl	r2
+
+	ALIGN (3)
+Loop:	movl	28(dst),%eax		/* fetch destination cache line */
+	leal	32(dst),dst
+
+L1:	movl	(src1),r1
+	movl	4(src1),r2
+	sbbl	x,r1
+	movl	4(src2),x
+	sbbl	x,r2
+	movl	8(src2),x
+	movl	r1,-32(dst)
+	movl	r2,-28(dst)
+
+L2:	movl	8(src1),r1
+	movl	12(src1),r2
+	sbbl	x,r1
+	movl	12(src2),x
+	sbbl	x,r2
+	movl	16(src2),x
+	movl	r1,-24(dst)
+	movl	r2,-20(dst)
+
+L3:	movl	16(src1),r1
+	movl	20(src1),r2
+	sbbl	x,r1
+	movl	20(src2),x
+	sbbl	x,r2
+	movl	24(src2),x
+	movl	r1,-16(dst)
+	movl	r2,-12(dst)
+
+L4:	movl	24(src1),r1
+	movl	28(src1),r2
+	sbbl	x,r1
+	movl	28(src2),x
+	sbbl	x,r2
+	movl	32(src2),x
+	movl	r1,-8(dst)
+	movl	r2,-4(dst)
+
+	leal	32(src1),src1
+	leal	32(src2),src2
+	decl	%ecx
+	jnz	Loop
+
+	popl	r2
+Lend:
+	decl	r2			/* test r2 w/o clobbering carry */
+	js	Lend2
+	incl	r2
+Loop2:
+	leal	4(dst),dst
+	movl	(src1),r1
+	sbbl	x,r1
+	movl	4(src2),x
+	movl	r1,-4(dst)
+	leal	4(src1),src1
+	leal	4(src2),src2
+	decl	r2
+	jnz	Loop2
+Lend2:
+	movl	(src1),r1
+	sbbl	x,r1
+	movl	r1,(dst)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
--- a/sysdeps/i386/i586/submul_1.S
+++ b/sysdeps/i386/i586/submul_1.S
@ -0,0 +1,82 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   size		(sp + 12)
+   s2_limb	(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebp
+
+	TEXT
+	ALIGN (3)
+	GLOBL	C_SYMBOL_NAME(__mpn_submul_1)
+C_SYMBOL_NAME(__mpn_submul_1:)
+
+	INSN1(push,l	,R(edi))
+	INSN1(push,l	,R(esi))
+	INSN1(push,l	,R(ebx))
+	INSN1(push,l	,R(ebp))
+
+	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))
+	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))
+	INSN2(mov,l	,R(size),MEM_DISP(esp,28))
+	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))
+
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
+	INSN1(neg,l	,R(size))
+	INSN2(xor,l	,R(edx),R(edx))
+	ALIGN (3)
+Loop:
+	INSN2(mov,l	,R(ebx),R(edx))
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))
+
+	INSN1(mul,l	,R(s2_limb))
+
+	INSN2(add,l	,R(eax),R(ebx))
+	INSN2(mov,l	,R(ebx),MEM_INDEX(res_ptr,size,4))
+
+	INSN2(adc,l	,R(edx),$0)
+	INSN2(sub,l	,R(ebx),R(eax))
+
+	INSN2(adc,l	,R(edx),$0)
+	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(ebx))
+
+	INSN1(inc,l	,R(size))
+	INSN1(jnz,	,Loop)
+
+
+	INSN2(mov,l	,R(eax),R(edx))
+	INSN1(pop,l	,R(ebp))
+	INSN1(pop,l	,R(ebx))
+	INSN1(pop,l	,R(esi))
+	INSN1(pop,l	,R(edi))
+	ret
--- a/sysdeps/i386/memchr.S
+++ b/sysdeps/i386/memchr.S
@ -0,0 +1,315 @@
+/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
+   than N.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+This version is developed using the same algorithm as the fast C
+version which carries the following introduction:
+
+Based on strlen implemention by Torbjorn Granlund (tege@sics.se),
+with help from Dan Sahlin (dan@sics.se) and
+commentary by Jim Blandy (jimb@ai.mit.edu);
+adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+and implemented by Roland McGrath (roland@ai.mit.edu).
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   c		(sp + 8)
+   len		(sp + 12)
+*/
+
+	.text
+ENTRY (memchr)
+	/* Save callee-safe registers used in this function.  */
+	pushl %esi
+	pushl %edi
+
+	/* Load parameters into registers.  */
+	movl 12(%esp), %eax	/* str: pointer to memory block.  */
+	movl 16(%esp), %edx	/* c: byte we are looking for.  */
+	movl 20(%esp), %esi	/* len: length of memory block.  */
+
+	/* If my must not test more than three characters test
+	   them one by one.  This is especially true for 0.  */
+	cmpl $4, %esi
+	jb L3
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require an
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32
+	   bit) memory access is aligned on a four-byte-boundary.
+	   So process first bytes one by one until boundary is
+	   reached. Don't use a loop for better performance.  */
+
+	testb $3, %eax		/* correctly aligned ? */
+	je L2			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L9			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L4			/* len==0 => return NULL */
+
+	testb $3, %eax		/* correctly aligned ? */
+	je L2			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L9			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L4			/* len==0 => return NULL */
+
+	testb $3, %eax		/* correctly aligned ? */
+	je L2			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L9			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	/* no test for len==0 here, because this is done in the
+	   loop head */
+	jmp L2
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN (4)
+
+L1:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter then last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occured in the last byte => it was 0.	*/
+	jnc L8
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L8			/* found it => return pointer */
+
+	/* This process is unfolded four times for better performance.
+	   we don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (following LL(13) below).  Even the len can be compared with
+	   constants instead of decrementing each time.  */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L7			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L7			/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L6			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L6			/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L5			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L5			/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes.  */
+	addl $16, %eax
+L2:	subl $16, %esi
+	jae L1			/* Still more than 16 bytes remaining */
+
+	/* Process remaining bytes separately.  */
+	cmpl $4-16, %esi	/* rest < 4 bytes? */
+	jb L3			/* yes, than test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L8			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L8			/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $8-16, %esi	/* rest < 8 bytes? */
+	jb L3			/* yes, than test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L8			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L8			/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $12-16, %esi	/* rest < 12 bytes? */
+	jb L3			/* yes, than test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L8			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L8			/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	/* Check the remaining bytes one by one.  */
+L3:	andl $3, %esi		/* mask out uninteresting bytes */
+	jz L4			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with C */
+	je L9			/* equal, than return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L4			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with C */
+	je L9			/* equal, than return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L4			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with C */
+	je L9			/* equal, than return pointer */
+
+L4:	/* no byte found => return NULL */
+	xorl %eax, %eax
+	jmp L9
+
+	/* add missing source pointer increments */
+L5:	addl $4, %eax
+L6:	addl $4, %eax
+L7:	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   at.  */
+L8:	testb %cl, %cl		/* test first byte in dword */
+	jz L9			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L9			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L9			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test needed we we known it is one of the four byytes.  */
+
+L9:	popl %edi		/* pop saved registers */
+	popl %esi
+
+	ret
--- a/sysdeps/i386/memchr.c
+++ b/sysdeps/i386/memchr.c
@ -1,48 +0,0 @@
-/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
-   than N.
-   For Intel 80x86, x>=3.
-   Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc.
-   Contributed by Torbjorn Granlund (tege@sics.se).
-
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
-
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
-
-#include <ansidecl.h>
-#include <string.h>
-
-#ifdef	__GNUC__
-
-PTR
-DEFUN(memchr, (str, c, len),
-      CONST PTR str AND int c AND size_t len)
-{
-  PTR retval;
-  asm("cld\n"			/* Search forward.  */
-      "testl %1,%1\n"		/* Clear Z flag, to handle LEN == 0.  */
-      /* Some old versions of gas need `repne' instead of `repnz'.  */
-      "repnz\n"			/* Search for C in al.  */
-      "scasb\n"
-      "movl %2,%0\n"		/* Set %0 to 0 (without affecting Z flag).  */
-      "jnz done\n"		/* Jump if we found nothing equal to C.  */
-      "leal -1(%1),%0\n"	/* edi has been incremented.  Return edi-1.  */
-      "done:" :
-      "=a" (retval), "=D" (str), "=c" (len) :
-      "0" (c), "1" (str), "2" (len));
-  return retval;
-}
-
-#else
-#include <sysdeps/generic/memchr.c>
-#endif
--- a/sysdeps/i386/memcmp.S
+++ b/sysdeps/i386/memcmp.S
@ -0,0 +1,68 @@
+/* memcmp -- compare two memory blocks for differences in the first COUNT
+	     bytes.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   block1	(sp + 4)
+   block2	(sp + 8)
+   len		(sp + 12)
+*/
+
+	.text
+ENTRY (memcmp)
+	pushl %esi		/* Save callee-safe registers.  */
+	movl %edi, %edx		/* Note that %edx is not used and can
+				   so be used to save %edi.  It's faster.  */
+
+	movl 12(%esp), %esi	/* Load address of block #1.  */
+	movl 16(%esp), %edi	/* Load address of block #2.  */
+	movl 20(%esp), %ecx	/* Load maximal length of compare area.  */
+
+	cld			/* Set direction of comparison.  */
+
+	xorl %eax, %eax		/* Default result.  */
+
+	repe			/* Compare at most %ecx bytes.  */
+	cmpsb
+	jz L1			/* If even last byte was equal we return 0.  */
+
+	/* The memory blocks are not equal.  So result of the last
+	   subtraction is present in the carry flag.  It is set when
+	   the byte in block #2 is bigger.  In this case we have to
+	   return -1 (=0xffffffff), else 1.  */
+	sbbl %eax, %eax		/* This is tricky.  %eax == 0 and carry is set
+				   or not depending on last subtraction.  */
+
+	/* At this point %eax == 0, if the byte of block #1 was bigger, and
+	   0xffffffff if the last byte of block #2 was bigger.  The later
+	   case is already correct but the former needs a little adjustment.
+	   Note that the following operation does not change 0xffffffff.  */
+	orb $1, %al		/* Change 0 to 1.  */
+
+L1:	popl %esi		/* Restore registers.  */
+	movl %edx, %edi
+
+	ret
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
--- a/sysdeps/i386/stpcpy.S
+++ b/sysdeps/i386/stpcpy.S
@ -0,0 +1,87 @@
+/* stpcpy -- copy SRC to DEST returning the address of the terminating '\0'
+	     in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+/* This function is defined neither in ANSI nor POSIX standards but is
+   also not invented here.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest		(sp + 4)
+   src		(sp + 8)
+*/
+
+	.text
+ENTRY (__stpcpy)
+	movl 4(%esp), %eax	/* load destination pointer */
+	movl 8(%esp), %ecx	/* load source pointer */
+
+	subl %eax, %ecx		/* magic: reduce number of loop variants
+				   to one using addressing mode */
+
+	/* Here we would like to write
+
+	subl $4, %eax
+	ALIGN (4)
+
+	but the assembler is too smart and optimizes for the shortest
+	form where the number only needs one byte.  But if we could
+	have the long form we would not need the alignment.  */
+
+	.byte 0x81, 0xe8	/* This is `subl $0x00000004, %eax' */
+	.long 0x00000004
+
+	/* Four times unfolded loop with only one loop counter.  This
+	   is achieved by the use of index+base adressing mode.  As the
+	   loop counter we use the destination address because this is
+	   also the result.  */
+L1:	addl $4, %eax		/* increment loop counter */
+
+	movb (%eax,%ecx), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L2			/* yes, then exit */
+
+	movb 1(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	movb 2(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L4			/* yes, then exit */
+
+	movb 3(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jnz L1			/* no, then continue loop */
+
+	incl %eax		/* correct loop counter */
+L4:	incl %eax
+L3:	incl %eax
+L2:
+	ret
+
+weak_alias (__stpcpy, stpcpy)
--- a/sysdeps/i386/stpncpy.S
+++ b/sysdeps/i386/stpncpy.S
@ -0,0 +1,143 @@
+/* stpncpy -- copy no more then N bytes from SRC to DEST, returning the
+	      address of the terminating '\0' in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+  - original wrote n+1 chars in some cases.
+  - stpncpy() ought to behave like strncpy() ie. not null-terminate
+    if limited by n.  glibc-1.09 stpncpy() does this.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest		(sp + 4)
+   src		(sp + 8)
+   maxlen	(sp + 12)
+*/
+
+	.text
+ENTRY (__stpncpy)
+
+	pushl %esi
+
+	movl 8(%esp), %eax	/* load destination pointer */
+	movl 12(%esp), %esi	/* load source pointer */
+	movl 16(%esp), %ecx	/* load maximal length */
+
+	subl %eax, %esi		/* magic: reduce number of loop variants
+				   to one using addressing mode */
+	jmp L1			/* jump to loop "head" */
+
+	ALIGN(4)
+
+	/* Four times unfolded loop with two loop counters.  We get the
+	   the third value (the source address) by using the index+base
+	   adressing mode.  */
+L2:	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L7			/* yes, then exit */
+
+	movb 1(%eax,%esi), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L6			/* yes, then exit */
+
+	movb 2(%eax,%esi), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L5			/* yes, then exit */
+
+	movb 3(%eax,%esi), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L4			/* yes, then exit */
+
+	addl $4, %eax		/* increment loop counter for full round */
+
+L1:	subl $4, %ecx		/* still more than 4 bytes allowed? */
+	jae L2			/* yes, then go to start of loop */
+
+	/* The maximal remaining 15 bytes are not processed in a loop.  */
+
+	addl $4, %ecx		/* correct above subtraction */
+	jz L9			/* maximal allowed char reached => go to end */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L9			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L9			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	jmp L9			/* we don't have to test for counter underflow
+				   because we know we had a most 3 bytes
+				   remaining => exit */
+
+	/* When coming from the main loop we have to adjust the pointer.  */
+L4:	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L5:	decl %ecx		/* increment pointer */
+	incl %eax		/* increment pointer */
+
+L6:	decl %ecx		/* increment pointer */
+	incl %eax		/* increment pointer */
+L7:
+
+	addl $3, %ecx		/* correct pre-decrementation of counter
+				   at the beginning of the loop; but why 3
+				   and not 4?  Very simple, we have to count
+				   the NUL char we already wrote.  */
+	jz L9			/* counter is also 0 => exit */
+
+	/* We now have to fill the rest of the buffer with NUL.  This
+	   is done in a tricky way.  Please note that the adressing mode
+	   used below is not the same we used above.  Here we use the
+	   %ecx register.  */
+L8:
+	movb $0, (%ecx,%eax)	/* store NUL char */
+L3:	decl %ecx		/* all bytes written? */
+	jnz L8			/* no, then again */
+
+L9:	popl %esi		/* restore saved register content */
+
+	ret
+
+weak_alias (__stpncpy, stpncpy)
--- a/sysdeps/i386/strchr.S
+++ b/sysdeps/i386/strchr.S
@ -0,0 +1,278 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   ch		(sp + 8)
+*/
+
+	.text
+ENTRY (strchr)
+	pushl %edi		/* Save callee-safe registers used here.  */
+
+	movl 8(%esp), %eax	/* get string pointer */
+	movl 12(%esp), %edx	/* get character we are looking for */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require an
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* now it is c|c|0|0 */
+	movw %cx, %dx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherant
+	      boundaries are multiples of 4.  */
+
+	testb $3, %eax		/* correctly aligned ? */
+	jz L11			/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L6			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %eax		/* correctly aligned ? */
+	jz L11			/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L6			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %eax		/* correctly aligned ? */
+	jz L11			/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L6			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	/* No we have reached alignment.  */
+	jmp L11			/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN(4)
+
+L1:	addl $16, %eax		/* adjust pointer for whole round */
+
+L11:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter then last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occured in the last byte => it was 0.	*/
+	jnc L7
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L7			/* found it => return pointer */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L2			/* found NUL => return NULL */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L71			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L71			/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L2			/* found NUL => return NULL */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L72			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L72			/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L2			/* found NUL => return NULL */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L73			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L73			/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L1			/* no NUL found => restart loop */
+
+L2:	/* Return NULL.  */
+	xorl %eax, %eax		/* load NULL in return value register */
+	popl %edi		/* restore saved register content */
+	ret
+
+L73:	addl $4, %eax		/* adjust pointer */
+L72:	addl $4, %eax
+L71:	addl $4, %eax
+
+	/* We now scan for the byte in which the character was matched.
+	   But we have to take care of the case that a NUL char is
+	   found before this in the dword.  */
+
+L7:	testb %cl, %cl		/* is first byte C? */
+	jz L6			/* yes => return pointer */
+	cmpb %dl, %cl		/* is first byte NUL? */
+	je L2			/* yes => return NULL */
+	incl %eax		/* it's not in the first byte */
+
+	testb %ch, %ch		/* is second byte C? */
+	jz L6			/* yes => return pointer */
+	cmpb %dl, %ch		/* is second byte NUL? */
+	je L2			/* yes => return NULL? */
+	incl %eax		/* it's not in the second byte */
+
+	shrl $16, %ecx		/* make upper byte accessible */
+	testb %cl, %cl		/* is third byte C? */
+	jz L6			/* yes => return pointer */
+	cmpb %dl, %cl		/* is third byte NUL? */
+	je L2			/* yes => return NULL */
+
+	/* It must be in the fourth byte and it cannot be NUL.  */
+	incl %eax
+
+L6:	popl %edi		/* restore saved register content */
+
+	ret
+
+weak_alias (strchr, index)
--- a/sysdeps/i386/strcspn.S
+++ b/sysdeps/i386/strcspn.S
@ -0,0 +1,176 @@
+/* strcspn (str, ss) -- Return the length of the initial segement of STR
+			which contains no characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   stopset	(sp + 8)
+*/
+
+	.text
+ENTRY (strcspn)
+	movl 4(%esp), %edx	/* get string pointer */
+	movl 8(%esp), %eax	/* get stopset pointer */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl $0		/* These immediate values make the label 2 */
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	pushl $0		/* get a better performance of the loop.  */
+	pushl $0
+	pushl $0
+	pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L2			/* no => process next dword from stopset */
+
+L1:	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the chracter is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L3:	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L4			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L5			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L6			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L3			/* yes => return */
+
+	incl %eax		/* adjust pointer */
+L6:	incl %eax
+L5:	incl %eax
+
+L4:	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	addl $256, %esp		/* remove stopset */
+
+	ret
--- a/sysdeps/i386/strpbrk.S
+++ b/sysdeps/i386/strpbrk.S
@ -0,0 +1,177 @@
+/* strcspn (str, ss) -- Return the length of the initial segement of STR
+			which contains no characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   stopset	(sp + 8)
+*/
+
+	.text
+ENTRY (strpbrk)
+	movl 4(%esp), %edx	/* get string pointer */
+	movl 8(%esp), %eax	/* get stopset pointer */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl $0		/* These immediate values make the label 2 */
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	pushl $0		/* get a better performance of the loop.  */
+	pushl $0
+	pushl $0
+	pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L2			/* no => process next dword from stopset */
+
+L1:	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the chracter is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L3:	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L4			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L5			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L6			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L3			/* yes => return */
+
+	incl %eax		/* adjust pointer */
+L6:	incl %eax
+L5:	incl %eax
+
+L4:	addl $256, %esp		/* remove stopset */
+
+	orb %cl, %cl		/* was last character NUL? */
+	jnz L7			/* no => return pointer */
+	xorl %eax, %eax		/* return NULL */
+
+L7:	ret
--- a/sysdeps/i386/strrchr.S
+++ b/sysdeps/i386/strrchr.S
@ -0,0 +1,321 @@
+/* strchr (str, ch) -- Return pointer to last occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   ch		(sp + 8)
+*/
+
+	.text
+ENTRY (strrchr)
+	pushl %edi		/* Save callee-safe registers used here.  */
+	pushl %esi
+
+	xorl %eax, %eax
+	movl 12(%esp), %esi	/* get string pointer */
+	movl 16(%esp), %ecx	/* get character we are looking for */
+
+	/* At the moment %ecx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require an
+	   prefix byte (and one more cycle).  */
+	movb %cl, %ch		/* now it is 0|0|c|c */
+	movl %ecx, %edx
+	shll $16, %ecx		/* now it is c|c|0|0 */
+	movw %dx, %cx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherant
+	      boundaries are multiples of 4.  */
+
+	testb $3, %esi		/* correctly aligned ? */
+	jz L19			/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L11			/* target found => return */
+	movl %esi, %eax		/* remember pointer as possible result */
+L11:	orb %dl, %dl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %esi		/* increment pointer */
+
+	testb $3, %esi		/* correctly aligned ? */
+	jz L19			/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L12			/* target found => return */
+	movl %esi, %eax		/* remember pointer as result */
+L12:	orb %dl, %dl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %esi		/* increment pointer */
+
+	testb $3, %esi		/* correctly aligned ? */
+	jz L19			/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L13			/* target found => return */
+	movl %esi, %eax		/* remember pointer as result */
+L13:	orb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %esi		/* increment pointer */
+
+	/* No we have reached alignment.  */
+	jmp L19			/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	/* Jump to here when the character is detected.  We chose this
+	   way around because the character one is looking for is not
+	   as frequent as the rest and taking a conditional jump is more
+	   expensive than ignoring it.
+
+	   Some more words to the code below: it might not be obvious why
+	   we decrement the source pointer here.  In the loop the pointer
+	   is not pre-incremented and so it still points before the word
+	   we are looking at.  But you should take a look at the instruction
+	   which gets executed before we get into the loop: `addl $16, %esi'.
+	   This makes the following subs into adds.  */
+
+	/* These fill bytes make the main loop be correctly aligned.
+	   We cannot use align because it is not the following instruction
+	   which should be aligned.  */
+	.byte 0, 0, 0, 0, 0, 0, 0, 0
+
+L4:	subl $4, %esi		/* adjust pointer */
+L41:	subl $4, %esi
+L42:	subl $4, %esi
+L43:	testl $0xff000000, %edx	/* is highest byte == C? */
+	jnz L33			/* no => try other bytes */
+	leal 15(%esi), %eax	/* store address as result */
+	jmp L1			/* and start loop again */
+
+L3:	subl $4, %esi		/* adjust pointer */
+L31:	subl $4, %esi
+L32:	subl $4, %esi
+L33:	testl $0xff0000, %edx	/* is C in third byte? */
+	jnz L51			/* no => try other bytes */
+	leal 14(%esi), %eax	/* store address as result */
+	jmp L1			/* and start loop again */
+
+L51:
+	/* At this point we know that the byte is in one of the lower bytes.
+	   We make a guess and correct it if necessary.  This reduces the
+	   number of necessary jumps.  */
+	leal 12(%esi), %eax	/* guess address of lowest byte as result */
+	testb %dh, %dh		/* is guess correct? */
+	jnz L1			/* yes => start loop */
+	leal 13(%esi), %eax	/* correct guess to second byte */
+
+L1:	addl $16, %esi		/* increment pointer for full round */
+
+L19:	movl (%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter then last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occured in the last byte => it was 0.	*/
+
+	jnc L20			/* found NUL => check last word */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %edx, %edi		/* (word+magic)^word */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L20			/* found NUL => check last word */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L4			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L3			/* C is detected in the word => examine it */
+
+	movl 4(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L21			/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L21			/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L41			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L31			/* C is detected in the word => examine it */
+
+	movl 8(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L22			/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L22			/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L42			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L32			/* C is detected in the word => examine it */
+
+	movl 12(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L23			/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L23			/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L43			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L1			/* C is not detected => restart loop */
+	jmp L33			/* examine word */
+
+L23:	addl $4, %esi		/* adjust pointer */
+L22:	addl $4, %esi
+L21:	addl $4, %esi
+
+	/* What remains to do is to test which byte the NUL char is and
+	   whether the searched character appears in one of the bytes
+	   before.  A special case is that the searched byte maybe NUL.
+	   In this case a pointer to the terminating NUL char has to be
+	   returned.  */
+
+L20:	cmpb %cl, %dl		/* is first byte == C? */
+	jne L24			/* no => skip */
+	movl %esi, %eax		/* store address as result */
+L24:	testb %dl, %dl		/* is first byte == NUL? */
+	jz L2			/* yes => return */
+
+	cmpb %cl, %dh		/* is second byte == C? */
+	jne L25			/* no => skip */
+	leal 1(%esi), %eax	/* store address as result */
+L25:	testb %dh, %dh		/* is second byte == NUL? */
+	jz L2			/* yes => return */
+
+	shrl $16,%edx		/* make upper bytes accessible */
+	cmpb %cl, %dl		/* is third byte == C */
+	jne L26			/* no => skip */
+	leal 2(%esi), %eax	/* store address as result */
+L26:	testb %dl, %dl		/* is third byte == NUL */
+	jz L2			/* yes => return */
+
+	cmpb %cl, %dh		/* is fourth byte == C */
+	jne L2			/* no => skip */
+	leal 3(%esi), %eax	/* store address as result */
+
+L2:	popl %esi		/* restore saved register content */
+	popl %edi
+
+	ret
+
+weak_alias (strrchr, rindex)
--- a/sysdeps/i386/strspn.S
+++ b/sysdeps/i386/strspn.S
@ -0,0 +1,176 @@
+/* strcspn (str, ss) -- Return the length of the initial segement of STR
+			which contains only characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   skipset	(sp + 8)
+*/
+
+	.text
+ENTRY (strspn)
+	movl 4(%esp), %edx	/* get string pointer */
+	movl 8(%esp), %eax	/* get skipset pointer */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl $0		/* These immediate values make the label 2 */
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	pushl $0		/* get a better performance of the loop.  */
+	pushl $0
+	pushl $0
+	pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L2			/* no => process next dword from stopset */
+
+L1:	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the chracter is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L3:	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L4			/* no => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L5			/* no => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L6			/* no => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jnz L3			/* yes => start loop again */
+
+	incl %eax		/* adjust pointer */
+L6:	incl %eax
+L5:	incl %eax
+
+L4:	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	addl $256, %esp		/* remove stopset */
+
+	ret
--- a/sysdeps/i386/sub_n.S
+++ b/sysdeps/i386/sub_n.S
@ -1,7 +1,7 @@
 /* i80386 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
   sum in a third limb vector.

-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

@ -37,10 +37,10 @@ C_SYMBOL_NAME(__mpn_sub_n:)
 	pushl %edi
 	pushl %esi

-	movl 12(%esp),%edi	/* res_ptr */
-	movl 16(%esp),%esi	/* s1_ptr */
-	movl 20(%esp),%edx	/* s2_ptr */
-	movl 24(%esp),%ecx	/* size */
+	movl 12(%esp),%edi		/* res_ptr */
+	movl 16(%esp),%esi		/* s1_ptr */
+	movl 20(%esp),%edx		/* s2_ptr */
+	movl 24(%esp),%ecx		/* size */

 	movl	%ecx,%eax
 	shrl	$3,%ecx			/* compute count for unrolled loop */
@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_sub_n:)
 	subl	%eax,%edx		/* ... enter the loop */
 	shrl	$2,%eax			/* restore previous value */
 #ifdef PIC
-	call	here
-here:	leal	(Loop - 3 - here)(%eax,%eax,8),%eax
-	addl	%eax,(%esp)
-	ret
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+	call	L0
+L0:	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(Loop-L0-3),%eax 
+	addl	$4,%esp
 #else
-	leal	(Loop - 3)(%eax,%eax,8),%eax	/* calc start addr in loop */
-	jmp	*%eax			/* jump into loop */
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(Loop - 3)(%eax,%eax,8),%eax
 #endif
+	jmp	*%eax			/* jump into loop */
 	ALIGN (3)
 Loop:	movl	(%esi),%eax
 	sbbl	(%edx),%eax
--- a/sysdeps/i960/add_n.s
+++ b/sysdeps/i960/add_n.s
@ -0,0 +1,21 @@
+.text
+	.align 4
+	.globl ___mpn_add_n
+___mpn_add_n:
+	mov	0,g6		# clear carry-save register
+	cmpo	1,0		# clear cy
+
+Loop:	subo	1,g3,g3		# update loop counter
+	ld	(g1),g5		# load from s1_ptr
+	addo	4,g1,g1		# s1_ptr++
+	ld	(g2),g4		# load from s2_ptr
+	addo	4,g2,g2		# s2_ptr++
+	cmpo	g6,1		# restore cy from g6, relies on cy being 0
+	addc	g4,g5,g4	# main add
+	subc	0,0,g6		# save cy in g6
+	st	g4,(g0)		# store result to res_ptr
+	addo	4,g0,g0		# res_ptr++
+	cmpobne	0,g3,Loop	# when branch is taken, clears C bit
+
+	mov	g6,g0
+	ret
--- a/sysdeps/i960/addmul_1.s
+++ b/sysdeps/i960/addmul_1.s
@ -0,0 +1,26 @@
+.text
+	.align	4
+	.globl	___mpn_mul_1
+___mpn_mul_1:
+	subo	g2,0,g2
+	shlo	2,g2,g4
+	subo	g4,g1,g1
+	subo	g4,g0,g13
+	mov	0,g0
+
+	cmpo	1,0		# clear C bit on AC.cc
+
+Loop:	ld	(g1)[g2*4],g5
+	emul	g3,g5,g6
+	ld	(g13)[g2*4],g5
+
+	addc	g0,g6,g6	# relies on that C bit is clear
+	addc	0,g7,g7
+	addc	g5,g6,g6	# relies on that C bit is clear
+	st	g6,(g13)[g2*4]
+	addc	0,g7,g0
+
+	addo	g2,1,g2
+	cmpobne	0,g2,Loop	# when branch is taken, clears C bit
+
+	ret
--- a/sysdeps/i960/mul_1.s
+++ b/sysdeps/i960/mul_1.s
@ -0,0 +1,23 @@
+.text
+	.align	4
+	.globl	___mpn_mul_1
+___mpn_mul_1:
+	subo	g2,0,g2
+	shlo	2,g2,g4
+	subo	g4,g1,g1
+	subo	g4,g0,g13
+	mov	0,g0
+
+	cmpo	1,0		# clear C bit on AC.cc
+
+Loop:	ld	(g1)[g2*4],g5
+	emul	g3,g5,g6
+
+	addc	g0,g6,g6	# relies on that C bit is clear
+	st	g6,(g13)[g2*4]
+	addc	0,g7,g0
+
+	addo	g2,1,g2
+	cmpobne	0,g2,Loop	# when branch is taken, clears C bit
+
+	ret
--- a/sysdeps/i960/sub_n.s
+++ b/sysdeps/i960/sub_n.s
@ -0,0 +1,21 @@
+.text
+	.align 4
+	.globl ___mpn_sub_n
+___mpn_sub_n:
+	mov	1,g6		# set carry-save register
+	cmpo	1,0		# clear cy
+
+Loop:	subo	1,g3,g3		# update loop counter
+	ld	(g1),g5		# load from s1_ptr
+	addo	4,g1,g1		# s1_ptr++
+	ld	(g2),g4		# load from s2_ptr
+	addo	4,g2,g2		# s2_ptr++
+	cmpo	g6,1		# restore cy from g6, relies on cy being 0
+	subc	g4,g5,g4	# main subtract
+	subc	0,0,g6		# save cy in g6
+	st	g4,(g0)		# store result to res_ptr
+	addo	4,g0,g0		# res_ptr++
+	cmpobne	0,g3,Loop	# when branch is taken, cy will be 0
+
+	mov	g6,g0
+	ret
--- a/sysdeps/m88k/m88100/add_n.s
+++ b/sysdeps/m88k/m88100/add_n.s
@ -0,0 +1,103 @@
+; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; s2_ptr	r4
+; size		r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+	text
+	align	 16
+	global	 ___mpn_add_n
+___mpn_add_n:
+	ld	r6,r3,0			; read first limb from s1_ptr
+	extu	r10,r5,3
+	ld	r7,r4,0			; read first limb from s2_ptr
+
+	subu.co	r5,r0,r5		; (clear carry as side effect)
+	mak	r5,r5,3<4>
+	bcnd	eq0,r5,Lzero
+
+	or	r12,r0,lo16(Lbase)
+	or.u	r12,r12,hi16(Lbase)
+	addu	r12,r12,r5		; r12 is address for entering in loop
+
+	extu	r5,r5,2			; divide by 4
+	subu	r2,r2,r5		; adjust res_ptr
+	subu	r3,r3,r5		; adjust s1_ptr
+	subu	r4,r4,r5		; adjust s2_ptr
+
+	or	r8,r6,r0
+
+	jmp.n	r12
+	 or	r9,r7,r0
+
+Loop:	addu	r3,r3,32
+	st	r8,r2,28
+	addu	r4,r4,32
+	ld	r6,r3,0
+	addu	r2,r2,32
+	ld	r7,r4,0
+Lzero:	subu	r10,r10,1		; add 0 + 8r limbs (adj loop cnt)
+Lbase:	ld	r8,r3,4
+	addu.cio r6,r6,r7
+	ld	r9,r4,4
+	st	r6,r2,0
+	ld	r6,r3,8			; add 7 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,8
+	st	r8,r2,4
+	ld	r8,r3,12		; add 6 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,12
+	st	r6,r2,8
+	ld	r6,r3,16		; add 5 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,16
+	st	r8,r2,12
+	ld	r8,r3,20		; add 4 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,20
+	st	r6,r2,16
+	ld	r6,r3,24		; add 3 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,24
+	st	r8,r2,20
+	ld	r8,r3,28		; add 2 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,28
+	st	r6,r2,24
+	bcnd.n	ne0,r10,Loop		; add 1 + 8r limbs
+	 addu.cio r8,r8,r9
+
+	st	r8,r2,28		; store most significant limb
+
+	jmp.n	 r1
+	 addu.ci r2,r0,r0		; return carry-out from most sign. limb
--- a/sysdeps/m88k/m88100/mul_1.s
+++ b/sysdeps/m88k/m88100/mul_1.s
@ -0,0 +1,128 @@
+; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention.  It might be tempting to move the
+;    ld instruction in the loops down to save 2 cycles (less WB contention),
+;    but that looses because the ultimate value will be read from outside
+;    the allocated space.  But if we handle the ultimate multiplication in
+;    the tail, we can do this.
+; 3. Make the multiplication with less instructions.  I think the code for
+;    (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb.  (Assuming infinite unrolling.)
+
+	text
+	align	 16
+	global	 ___mpn_mul_1
+___mpn_mul_1:
+
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda	 r3,r3[r4]
+	lda	 r6,r2[r4]		; RES_PTR in r6 since r2 is retval
+	subu	 r4,r0,r4
+
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+	ld	 r9,r3[r4]
+	mask	 r7,r5,0xffff		; r7 = lo(S2_LIMB)
+	extu	 r8,r5,16		; r8 = hi(S2_LIMB)
+	bcnd.n	 eq0,r8,Lsmall		; jump if (hi(S2_LIMB) == 0)
+	 subu	 r6,r6,4
+
+; General code for any value of S2_LIMB.
+
+	; Make a stack frame and save r25 and r26
+	subu	 r31,r31,16
+	st.d	 r25,r31,8
+
+	; Enter the loop in the middle
+	br.n	L1
+	addu	 r4,r4,1
+
+Loop:
+	ld	 r9,r3[r4]
+	st	 r26,r6[r4]
+; bcnd	ne0,r0,0			; bubble
+	addu	 r4,r4,1
+L1:	mul	 r26,r9,r5		; low word of product	mul_1	WB ld
+	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)	mask_1
+	mul	 r11,r12,r7		; r11 =  prod_0		mul_2	WB mask_1
+	mul	 r10,r12,r8		; r10 = prod_1a		mul_3
+	extu	 r13,r9,16		; r13 = hi(s1_limb)	extu_1	WB mul_1
+	mul	 r12,r13,r7		; r12 = prod_1b		mul_4	WB extu_1
+	mul	 r25,r13,r8		; r25  = prod_2		mul_5	WB mul_2
+	extu	 r11,r11,16		; r11 = hi(prod_0)	extu_2	WB mul_3
+	addu	 r10,r10,r11		;			addu_1	WB extu_2
+; bcnd	ne0,r0,0			; bubble			WB addu_1
+	addu.co	 r10,r10,r12		;				WB mul_4
+	mask.u	 r10,r10,0xffff		; move the 16 most significant bits...
+	addu.ci	 r10,r10,r0		; ...to the low half of the word...
+	rot	 r10,r10,16		; ...and put carry in pos 16.
+	addu.co	 r26,r26,r2		; add old carry limb
+	bcnd.n	 ne0,r4,Loop
+	 addu.ci r2,r25,r10		; compute new carry limb
+
+	st	 r26,r6[r4]
+	ld.d	 r25,r31,8
+	jmp.n	 r1
+	 addu	 r31,r31,16
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+	; Enter the loop in the middle
+	br.n	SL1
+	addu	 r4,r4,1
+
+SLoop:
+	ld	 r9,r3[r4]		;
+	st	 r8,r6[r4]		;
+	addu	 r4,r4,1		;
+SL1:	mul	 r8,r9,r5		; low word of product
+	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)
+	extu	 r13,r9,16		; r13 = hi(s1_limb)
+	mul	 r11,r12,r7		; r11 =  prod_0
+	mul	 r12,r13,r7		; r12 = prod_1b
+	addu.cio r8,r8,r2		; add old carry limb
+	extu	 r10,r11,16		; r11 = hi(prod_0)
+	addu	 r10,r10,r12		;
+	bcnd.n	 ne0,r4,SLoop
+	extu	 r2,r10,16		; r2 = new carry limb
+
+	jmp.n	 r1
+	st	 r8,r6[r4]
--- a/sysdeps/m88k/m88100/sub_n.s
+++ b/sysdeps/m88k/m88100/sub_n.s
@ -0,0 +1,104 @@
+; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; s2_ptr	r4
+; size		r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+	text
+	align	 16
+	global	 ___mpn_sub_n
+___mpn_sub_n:
+	ld	r6,r3,0			; read first limb from s1_ptr
+	extu	r10,r5,3
+	ld	r7,r4,0			; read first limb from s2_ptr
+
+	subu.co	r5,r0,r5		; (clear carry as side effect)
+	mak	r5,r5,3<4>
+	bcnd	eq0,r5,Lzero
+
+	or	r12,r0,lo16(Lbase)
+	or.u	r12,r12,hi16(Lbase)
+	addu	r12,r12,r5		; r12 is address for entering in loop
+
+	extu	r5,r5,2			; divide by 4
+	subu	r2,r2,r5		; adjust res_ptr
+	subu	r3,r3,r5		; adjust s1_ptr
+	subu	r4,r4,r5		; adjust s2_ptr
+
+	or	r8,r6,r0
+
+	jmp.n	r12
+	 or	r9,r7,r0
+
+Loop:	addu	r3,r3,32
+	st	r8,r2,28
+	addu	r4,r4,32
+	ld	r6,r3,0
+	addu	r2,r2,32
+	ld	r7,r4,0
+Lzero:	subu	r10,r10,1		; subtract 0 + 8r limbs (adj loop cnt)
+Lbase:	ld	r8,r3,4
+	subu.cio r6,r6,r7
+	ld	r9,r4,4
+	st	r6,r2,0
+	ld	r6,r3,8			; subtract 7 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,8
+	st	r8,r2,4
+	ld	r8,r3,12		; subtract 6 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,12
+	st	r6,r2,8
+	ld	r6,r3,16		; subtract 5 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,16
+	st	r8,r2,12
+	ld	r8,r3,20		; subtract 4 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,20
+	st	r6,r2,16
+	ld	r6,r3,24		; subtract 3 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,24
+	st	r8,r2,20
+	ld	r8,r3,28		; subtract 2 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,28
+	st	r6,r2,24
+	bcnd.n	ne0,r10,Loop		; subtract 1 + 8r limbs
+	 subu.cio r8,r8,r9
+
+	st	r8,r2,28		; store most significant limb
+
+	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
--- a/sysdeps/m88k/m88110/mul_1.s
+++ b/sysdeps/m88k/m88110/mul_1.s
@ -0,0 +1,84 @@
+; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+	text
+	align	16
+	global	___mpn_mul_1
+___mpn_mul_1:
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda	 r3,r3[r4]
+	lda	 r8,r2[r4]		; RES_PTR in r8 since r2 is retval
+	subu	 r4,r0,r4
+
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+
+	ld	 r6,r3[r4]
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5
+	bcnd.n	 eq0,r4,Lend
+	 subu	 r8,r8,8
+
+Loop:	ld	 r6,r3[r4]
+	addu.cio r9,r11,r2
+	or	 r2,r10,r0		; could be avoided if unrolled
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5
+	bcnd.n	 ne0,r4,Loop
+	 st	 r9,r8[r4]
+
+Lend:	addu.cio r9,r11,r2
+	st	 r9,r8,4
+	jmp.n	 r1
+	 addu.ci r2,r10,r0
+
+; This is the Right Way to do this on '110.  4 cycles / 64-bit limb.
+;	ld.d	r10,
+;	mulu.d
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d	,r11,r5
+;	ld.d	r12,
+;	mulu.d	,r10,r5
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d
+;	ld.d	r10,
+;	mulu.d
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d
+;	ld.d	r10,
+;	mulu.d
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d
--- a/sysdeps/mips/add_n.s
+++ b/sysdeps/mips/add_n.s
@ -0,0 +1,119 @@
+ # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.set	noreorder
+	.set	nomacro
+
+	lw	$10,0($5)
+	lw	$11,0($6)
+
+	addiu	$7,$7,-1
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	subu	$7,$7,$9
+
+.Loop0:	addiu	$9,$9,-1
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,4
+	addiu	$6,$6,4
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 addiu	$4,$4,4
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	addiu	$7,$7,-4
+
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	lw	$10,8($5)
+	addu	$13,$13,$2
+	lw	$11,8($6)
+	sltu	$8,$13,$2
+	addu	$13,$12,$13
+	sltu	$2,$13,$12
+	sw	$13,4($4)
+	or	$2,$2,$8
+
+	lw	$12,12($5)
+	addu	$11,$11,$2
+	lw	$13,12($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,8($4)
+	or	$2,$2,$8
+
+	lw	$10,16($5)
+	addu	$13,$13,$2
+	lw	$11,16($6)
+	sltu	$8,$13,$2
+	addu	$13,$12,$13
+	sltu	$2,$13,$12
+	sw	$13,12($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,16
+	addiu	$6,$6,16
+
+	bne	$7,$0,.Loop
+	 addiu	$4,$4,16
+
+.Lend:	addu	$11,$11,$2
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+
+	.end	__mpn_add_n
--- a/sysdeps/mips/addmul_1.s
+++ b/sysdeps/mips/addmul_1.s
@ -0,0 +1,96 @@
+ # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	 4
+	.globl	 __mpn_addmul_1
+	.ent	__mpn_addmul_1
+__mpn_addmul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	lw	$8,0($5)
+
+ # warm up phase 1
+	addiu	$5,$5,4
+	multu	$8,$7
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		# zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	lw	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addiu	$5,$5,4
+	addu	$3,$3,$2	# add old carry limb to low product limb
+	multu	$8,$7
+	lw	$8,0($5)	# load new s1 limb as early as possible
+	addiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop	# should be "bnel"
+	 addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	multu	$8,$7
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	j	$31
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_addmul_1
--- a/sysdeps/mips/lshift.s
+++ b/sysdeps/mips/lshift.s
@ -0,0 +1,94 @@
+ # MIPS2 __mpn_lshift --
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.set	noreorder
+	.set	nomacro
+
+	sll	$2,$6,2
+	addu	$5,$5,$2	# make r5 point at end of src
+	lw	$10,-4($5)	# load first limb
+	subu	$13,$0,$7
+	addu	$4,$4,$2	# make r4 point at end of res
+	addiu	$6,$6,-1
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 srl	$2,$10,$13	# compute function result
+
+	subu	$6,$6,$9
+
+.Loop0:	lw	$3,-8($5)
+	addiu	$4,$4,-4
+	addiu	$5,$5,-4
+	addiu	$9,$9,-1
+	sll	$11,$10,$7
+	srl	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sw	$8,0($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	lw	$3,-8($5)
+	addiu	$4,$4,-16
+	addiu	$6,$6,-4
+	sll	$11,$10,$7
+	srl	$12,$3,$13
+
+	lw	$10,-12($5)
+	sll	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,12($4)
+	srl	$9,$10,$13
+
+	lw	$3,-16($5)
+	sll	$11,$10,$7
+	or	$8,$14,$9
+	sw	$8,8($4)
+	srl	$12,$3,$13
+
+	lw	$10,-20($5)
+	sll	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,4($4)
+	srl	$9,$10,$13
+
+	addiu	$5,$5,-16
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sw	$8,0($4)
+
+.Lend:	sll	$8,$10,$7
+	j	$31
+	sw	$8,-4($4)
+	.end	__mpn_lshift
--- a/sysdeps/mips/mips3/add_n.s
+++ b/sysdeps/mips/mips3/add_n.s
@ -0,0 +1,119 @@
+ # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.set	noreorder
+	.set	nomacro
+
+	ld	$10,0($5)
+	ld	$11,0($6)
+
+	daddiu	$7,$7,-1
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	dsubu	$7,$7,$9
+
+.Loop0:	daddiu	$9,$9,-1
+	ld	$12,8($5)
+	daddu	$11,$11,$2
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,8
+	daddiu	$6,$6,8
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 daddiu	$4,$4,8
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	daddiu	$7,$7,-4
+
+	ld	$12,8($5)
+	daddu	$11,$11,$2
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	ld	$10,16($5)
+	daddu	$13,$13,$2
+	ld	$11,16($6)
+	sltu	$8,$13,$2
+	daddu	$13,$12,$13
+	sltu	$2,$13,$12
+	sd	$13,8($4)
+	or	$2,$2,$8
+
+	ld	$12,24($5)
+	daddu	$11,$11,$2
+	ld	$13,24($6)
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,16($4)
+	or	$2,$2,$8
+
+	ld	$10,32($5)
+	daddu	$13,$13,$2
+	ld	$11,32($6)
+	sltu	$8,$13,$2
+	daddu	$13,$12,$13
+	sltu	$2,$13,$12
+	sd	$13,24($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,32
+	daddiu	$6,$6,32
+
+	bne	$7,$0,.Loop
+	 daddiu	$4,$4,32
+
+.Lend:	daddu	$11,$11,$2
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+
+	.end	__mpn_add_n
--- a/sysdeps/mips/mips3/addmul_1.s
+++ b/sysdeps/mips/mips3/addmul_1.s
@ -0,0 +1,96 @@
+ # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	4
+	.globl	__mpn_addmul_1
+	.ent	__mpn_addmul_1
+__mpn_addmul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	ld	$8,0($5)
+
+ # warm up phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		# zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	ld	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddiu	$5,$5,8
+	daddu	$3,$3,$2	# add old carry limb to low product limb
+	dmultu	$8,$7
+	ld	$8,0($5)	# load new s1 limb as early as possible
+	daddiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop	# should be "bnel"
+	 daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dmultu	$8,$7
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	j	$31
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_addmul_1
--- a/sysdeps/mips/mips3/gmp-mparam.h
+++ b/sysdeps/mips/mips3/gmp-mparam.h
@ -0,0 +1,26 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
--- a/sysdeps/mips/mips3/lshift.s
+++ b/sysdeps/mips/mips3/lshift.s
@ -0,0 +1,94 @@
+ # MIPS3 __mpn_lshift --
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.set	noreorder
+	.set	nomacro
+
+	dsll	$2,$6,3
+	daddu	$5,$5,$2	# make r5 point at end of src
+	ld	$10,-8($5)	# load first limb
+	dsubu	$13,$0,$7
+	daddu	$4,$4,$2	# make r4 point at end of res
+	daddiu	$6,$6,-1
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 dsrl	$2,$10,$13	# compute function result
+
+	dsubu	$6,$6,$9
+
+.Loop0:	ld	$3,-16($5)
+	daddiu	$4,$4,-8
+	daddiu	$5,$5,-8
+	daddiu	$9,$9,-1
+	dsll	$11,$10,$7
+	dsrl	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sd	$8,0($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	ld	$3,-16($5)
+	daddiu	$4,$4,-32
+	daddiu	$6,$6,-4
+	dsll	$11,$10,$7
+	dsrl	$12,$3,$13
+
+	ld	$10,-24($5)
+	dsll	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,24($4)
+	dsrl	$9,$10,$13
+
+	ld	$3,-32($5)
+	dsll	$11,$10,$7
+	or	$8,$14,$9
+	sd	$8,16($4)
+	dsrl	$12,$3,$13
+
+	ld	$10,-40($5)
+	dsll	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,8($4)
+	dsrl	$9,$10,$13
+
+	daddiu	$5,$5,-32
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sd	$8,0($4)
+
+.Lend:	dsll	$8,$10,$7
+	j	$31
+	sd	$8,-8($4)
+	.end	__mpn_lshift
--- a/sysdeps/mips/mips3/mul_1.s
+++ b/sysdeps/mips/mips3/mul_1.s
@ -0,0 +1,84 @@
+ # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	4
+	.globl	__mpn_mul_1
+	.ent	__mpn_mul_1
+__mpn_mul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	ld	$8,0($5)
+
+ # warm up phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		# zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	ld	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	mflo	$10
+	mfhi	$9
+	daddiu	$5,$5,8
+	daddu	$10,$10,$2	# add old carry limb to low product limb
+	dmultu	$8,$7
+	ld	$8,0($5)	# load new s1 limb as early as possible
+	daddiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$10,$2	# carry from previous addition -> $2
+	sd	$10,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop	# should be "bnel"
+	 daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	mflo	$10
+	mfhi	$9
+	daddu	$10,$10,$2
+	sltu	$2,$10,$2
+	dmultu	$8,$7
+	sd	$10,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	mflo	$10
+	mfhi	$9
+	daddu	$10,$10,$2
+	sltu	$2,$10,$2
+	sd	$10,0($4)
+	j	$31
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_mul_1
--- a/sysdeps/mips/mips3/rshift.s
+++ b/sysdeps/mips/mips3/rshift.s
@ -0,0 +1,91 @@
+ # MIPS3 __mpn_rshift --
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.set	noreorder
+	.set	nomacro
+
+	ld	$10,0($5)	# load first limb
+	dsubu	$13,$0,$7
+	daddiu	$6,$6,-1
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 dsll	$2,$10,$13	# compute function result
+
+	dsubu	$6,$6,$9
+
+.Loop0:	ld	$3,8($5)
+	daddiu	$4,$4,8
+	daddiu	$5,$5,8
+	daddiu	$9,$9,-1
+	dsrl	$11,$10,$7
+	dsll	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sd	$8,-8($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	ld	$3,8($5)
+	daddiu	$4,$4,32
+	daddiu	$6,$6,-4
+	dsrl	$11,$10,$7
+	dsll	$12,$3,$13
+
+	ld	$10,16($5)
+	dsrl	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,-32($4)
+	dsll	$9,$10,$13
+
+	ld	$3,24($5)
+	dsrl	$11,$10,$7
+	or	$8,$14,$9
+	sd	$8,-24($4)
+	dsll	$12,$3,$13
+
+	ld	$10,32($5)
+	dsrl	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,-16($4)
+	dsll	$9,$10,$13
+
+	daddiu	$5,$5,32
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sd	$8,-8($4)
+
+.Lend:	dsrl	$8,$10,$7
+	j	$31
+	sd	$8,0($4)
+	.end	__mpn_rshift
--- a/sysdeps/mips/mips3/sub_n.s
+++ b/sysdeps/mips/mips3/sub_n.s
@ -0,0 +1,119 @@
+ # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.set	noreorder
+	.set	nomacro
+
+	ld	$10,0($5)
+	ld	$11,0($6)
+
+	daddiu	$7,$7,-1
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	dsubu	$7,$7,$9
+
+.Loop0:	daddiu	$9,$9,-1
+	ld	$12,8($5)
+	daddu	$11,$11,$2
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,8
+	daddiu	$6,$6,8
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 daddiu	$4,$4,8
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	daddiu	$7,$7,-4
+
+	ld	$12,8($5)
+	daddu	$11,$11,$2
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	ld	$10,16($5)
+	daddu	$13,$13,$2
+	ld	$11,16($6)
+	sltu	$8,$13,$2
+	dsubu	$13,$12,$13
+	sltu	$2,$12,$13
+	sd	$13,8($4)
+	or	$2,$2,$8
+
+	ld	$12,24($5)
+	daddu	$11,$11,$2
+	ld	$13,24($6)
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,16($4)
+	or	$2,$2,$8
+
+	ld	$10,32($5)
+	daddu	$13,$13,$2
+	ld	$11,32($6)
+	sltu	$8,$13,$2
+	dsubu	$13,$12,$13
+	sltu	$2,$12,$13
+	sd	$13,24($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,32
+	daddiu	$6,$6,32
+
+	bne	$7,$0,.Loop
+	 daddiu	$4,$4,32
+
+.Lend:	daddu	$11,$11,$2
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+
+	.end	__mpn_sub_n
--- a/sysdeps/mips/mips3/submul_1.s
+++ b/sysdeps/mips/mips3/submul_1.s
@ -0,0 +1,96 @@
+ # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	4
+	.globl	__mpn_submul_1
+	.ent	__mpn_submul_1
+__mpn_submul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	ld	$8,0($5)
+
+ # warm up phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		# zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	ld	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddiu	$5,$5,8
+	daddu	$3,$3,$2	# add old carry limb to low product limb
+	dmultu	$8,$7
+	ld	$8,0($5)	# load new s1 limb as early as possible
+	daddiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	dsubu	$3,$10,$3
+	sgtu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop	# should be "bnel"
+	 daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dmultu	$8,$7
+	dsubu	$3,$10,$3
+	sgtu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dsubu	$3,$10,$3
+	sgtu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	j	$31
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_submul_1
--- a/sysdeps/mips/mul_1.s
+++ b/sysdeps/mips/mul_1.s
@ -0,0 +1,84 @@
+ # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	 4
+	.globl	 __mpn_mul_1
+	.ent	__mpn_mul_1
+__mpn_mul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	lw	$8,0($5)
+
+ # warm up phase 1
+	addiu	$5,$5,4
+	multu	$8,$7
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		# zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	lw	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	mflo	$10
+	mfhi	$9
+	addiu	$5,$5,4
+	addu	$10,$10,$2	# add old carry limb to low product limb
+	multu	$8,$7
+	lw	$8,0($5)	# load new s1 limb as early as possible
+	addiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$10,$2	# carry from previous addition -> $2
+	sw	$10,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop	# should be "bnel"
+	 addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	mflo	$10
+	mfhi	$9
+	addu	$10,$10,$2
+	sltu	$2,$10,$2
+	multu	$8,$7
+	sw	$10,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	mflo	$10
+	mfhi	$9
+	addu	$10,$10,$2
+	sltu	$2,$10,$2
+	sw	$10,0($4)
+	j	$31
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_mul_1
--- a/sysdeps/mips/rshift.s
+++ b/sysdeps/mips/rshift.s
@ -0,0 +1,91 @@
+ # MIPS2 __mpn_rshift --
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.set	noreorder
+	.set	nomacro
+
+	lw	$10,0($5)	# load first limb
+	subu	$13,$0,$7
+	addiu	$6,$6,-1
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 sll	$2,$10,$13	# compute function result
+
+	subu	$6,$6,$9
+
+.Loop0:	lw	$3,4($5)
+	addiu	$4,$4,4
+	addiu	$5,$5,4
+	addiu	$9,$9,-1
+	srl	$11,$10,$7
+	sll	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sw	$8,-4($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	lw	$3,4($5)
+	addiu	$4,$4,16
+	addiu	$6,$6,-4
+	srl	$11,$10,$7
+	sll	$12,$3,$13
+
+	lw	$10,8($5)
+	srl	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,-16($4)
+	sll	$9,$10,$13
+
+	lw	$3,12($5)
+	srl	$11,$10,$7
+	or	$8,$14,$9
+	sw	$8,-12($4)
+	sll	$12,$3,$13
+
+	lw	$10,16($5)
+	srl	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,-8($4)
+	sll	$9,$10,$13
+
+	addiu	$5,$5,16
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sw	$8,-4($4)
+
+.Lend:	srl	$8,$10,$7
+	j	$31
+	sw	$8,0($4)
+	.end	__mpn_rshift
--- a/sysdeps/mips/sub_n.s
+++ b/sysdeps/mips/sub_n.s
@ -0,0 +1,119 @@
+ # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.set	noreorder
+	.set	nomacro
+
+	lw	$10,0($5)
+	lw	$11,0($6)
+
+	addiu	$7,$7,-1
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	subu	$7,$7,$9
+
+.Loop0:	addiu	$9,$9,-1
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,4
+	addiu	$6,$6,4
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 addiu	$4,$4,4
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	addiu	$7,$7,-4
+
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	lw	$10,8($5)
+	addu	$13,$13,$2
+	lw	$11,8($6)
+	sltu	$8,$13,$2
+	subu	$13,$12,$13
+	sltu	$2,$12,$13
+	sw	$13,4($4)
+	or	$2,$2,$8
+
+	lw	$12,12($5)
+	addu	$11,$11,$2
+	lw	$13,12($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,8($4)
+	or	$2,$2,$8
+
+	lw	$10,16($5)
+	addu	$13,$13,$2
+	lw	$11,16($6)
+	sltu	$8,$13,$2
+	subu	$13,$12,$13
+	sltu	$2,$12,$13
+	sw	$13,12($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,16
+	addiu	$6,$6,16
+
+	bne	$7,$0,.Loop
+	 addiu	$4,$4,16
+
+.Lend:	addu	$11,$11,$2
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+
+	.end	__mpn_sub_n
--- a/sysdeps/mips/submul_1.s
+++ b/sysdeps/mips/submul_1.s
@ -0,0 +1,96 @@
+ # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	 4
+	.globl	 __mpn_submul_1
+	.ent	__mpn_submul_1
+__mpn_submul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	lw	$8,0($5)
+
+ # warm up phase 1
+	addiu	$5,$5,4
+	multu	$8,$7
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		# zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	lw	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addiu	$5,$5,4
+	addu	$3,$3,$2	# add old carry limb to low product limb
+	multu	$8,$7
+	lw	$8,0($5)	# load new s1 limb as early as possible
+	addiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	subu	$3,$10,$3
+	sgtu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop	# should be "bnel"
+	 addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	multu	$8,$7
+	subu	$3,$10,$3
+	sgtu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	subu	$3,$10,$3
+	sgtu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	j	$31
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_submul_1
--- a/sysdeps/rs6000/add_n.s
+++ b/sysdeps/rs6000/add_n.s
@ -0,0 +1,54 @@
+# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+
+	.toc
+	.extern __mpn_add_n[DS]
+	.extern .__mpn_add_n
+.csect [PR]
+	.align 2
+	.globl __mpn_add_n
+	.globl .__mpn_add_n
+	.csect __mpn_add_n[DS]
+__mpn_add_n:
+	.long .__mpn_add_n, TOC[tc0], 0
+	.csect [PR]
+.__mpn_add_n:
+	mtctr	6		# copy size into CTR
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before used
+	a	7,0,8		# add least significant limbs, set cy
+	bdz	Lend		# If done, skip loop
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store previous limb in load latecny slot
+	ae	7,0,8		# add new limbs with cy, set cy
+	bdn	Loop		# decrement CTR and loop back
+Lend:	st	7,4(3)		# store ultimate result limb
+	lil	3,0		# load cy into ...
+	aze	3,3		# ... return value register
+	br
--- a/sysdeps/rs6000/addmul_1.s
+++ b/sysdeps/rs6000/addmul_1.s
@ -0,0 +1,122 @@
+# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_addmul_1[PR]
+	.align 2
+	.globl __mpn_addmul_1
+	.globl .__mpn_addmul_1
+	.csect __mpn_addmul_1[DS]
+__mpn_addmul_1:
+	.long .__mpn_addmul_1[PR], TOC[tc0], 0
+	.csect .__mpn_addmul_1[PR]
+.__mpn_addmul_1:
+
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	8
+	cax	9,9,7
+	l	7,4(3)
+	a	8,8,7		# add res_limb
+	blt	Lneg
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	l	7,4(3)
+	aze	9,9
+	a	8,8,7
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	7
+	ae	8,7,9
+	l	7,4(3)
+	ae	10,10,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	8,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
--- a/sysdeps/rs6000/lshift.s
+++ b/sysdeps/rs6000/lshift.s
@ -0,0 +1,58 @@
+# IBM POWER __mpn_lshift -- 
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr		r4
+# size		r5
+# cnt		r6
+
+	.toc
+	.extern __mpn_lshift[DS]
+	.extern .__mpn_lshift
+.csect [PR]
+	.align 2
+	.globl __mpn_lshift
+	.globl .__mpn_lshift
+	.csect __mpn_lshift[DS]
+__mpn_lshift:
+	.long .__mpn_lshift, TOC[tc0], 0
+	.csect [PR]
+.__mpn_lshift:
+	sli	0,5,2
+	cax	9,3,0
+	cax	4,4,0
+	sfi	8,6,32
+	mtctr	5		# put limb count in CTR loop register
+	lu	0,-4(4)		# read most significant limb
+	sre	3,0,8		# compute carry out limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,-4(4)		# read 2:nd most significant limb
+	sreq	7,0,8		# compute most significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,-4(4)		# load next lower limb
+	stu	7,-4(9)		# store previous result during read latency
+	sreq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,-4(9)		# store 2:nd least significant limb
+Lend2:	sle	7,0,6		# compute least significant limb
+	st      7,-4(9)		# store it"				\
+	br
--- a/sysdeps/rs6000/mul_1.s
+++ b/sysdeps/rs6000/mul_1.s
@ -0,0 +1,109 @@
+# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_mul_1[PR]
+	.align 2
+	.globl __mpn_mul_1
+	.globl .__mpn_mul_1
+	.csect __mpn_mul_1[DS]
+__mpn_mul_1:
+	.long .__mpn_mul_1[PR], TOC[tc0], 0
+	.csect .__mpn_mul_1[PR]
+.__mpn_mul_1:
+
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	8
+	ai	0,0,0		# reset carry
+	cax	9,9,7
+	blt	Lneg
+Lpos:	bdz	Lend
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	cax	10,10,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,9
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	cax	9,9,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,10
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
--- a/sysdeps/rs6000/rshift.s
+++ b/sysdeps/rs6000/rshift.s
@ -0,0 +1,56 @@
+# IBM POWER __mpn_rshift -- 
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr		r4
+# size		r5
+# cnt		r6
+
+	.toc
+	.extern __mpn_rshift[DS]
+	.extern .__mpn_rshift
+.csect [PR]
+	.align 2
+	.globl __mpn_rshift
+	.globl .__mpn_rshift
+	.csect __mpn_rshift[DS]
+__mpn_rshift:
+	.long .__mpn_rshift, TOC[tc0], 0
+	.csect [PR]
+.__mpn_rshift:
+	sfi	8,6,32
+	mtctr	5		# put limb count in CTR loop register
+	l	0,0(4)		# read least significant limb
+	ai	9,3,-4		# adjust res_ptr since it's offset in the stu:s
+	sle	3,0,8		# compute carry limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,4(4)		# read 2:nd least significant limb
+	sleq	7,0,8		# compute least significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,4(4)		# load next higher limb
+	stu	7,4(9)		# store previous result during read latency
+	sleq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,4(9)		# store 2:nd most significant limb
+Lend2:	sre	7,0,6		# compute most significant limb
+	st      7,4(9)		# store it"				\
+	br
--- a/sysdeps/rs6000/sub_n.s
+++ b/sysdeps/rs6000/sub_n.s
@ -0,0 +1,55 @@
+# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+
+	.toc
+	.extern __mpn_sub_n[DS]
+	.extern .__mpn_sub_n
+.csect [PR]
+	.align 2
+	.globl __mpn_sub_n
+	.globl .__mpn_sub_n
+	.csect __mpn_sub_n[DS]
+__mpn_sub_n:
+	.long .__mpn_sub_n, TOC[tc0], 0
+	.csect [PR]
+.__mpn_sub_n:
+	mtctr	6		# copy size into CTR
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before used
+	sf	7,0,8		# add least significant limbs, set cy
+	bdz	Lend		# If done, skip loop
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store previous limb in load latecny slot
+	sfe	7,0,8		# add new limbs with cy, set cy
+	bdn	Loop		# decrement CTR and loop back
+Lend:	st	7,4(3)		# store ultimate result limb
+	sfe	3,0,0		# load !cy into ...
+	sfi	3,3,0		# ... return value register
+	br
--- a/sysdeps/rs6000/submul_1.s
+++ b/sysdeps/rs6000/submul_1.s
@ -0,0 +1,127 @@
+# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_submul_1[PR]
+	.align 2
+	.globl __mpn_submul_1
+	.globl .__mpn_submul_1
+	.csect __mpn_submul_1[DS]
+__mpn_submul_1:
+	.long .__mpn_submul_1[PR], TOC[tc0], 0
+	.csect .__mpn_submul_1[PR]
+.__mpn_submul_1:
+
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	11
+	cax	9,9,7
+	l	7,4(3)
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	blt	Lneg
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	11,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		# propagate cy to new cy_limb
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	11,0,10
+	l	7,4(3)
+	aze	9,9
+	sf	8,11,7
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	7
+	ae	11,7,9
+	l	7,4(3)
+	ae	10,10,0		# propagate cy to new cy_limb
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	11,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
--- a/sysdeps/sparc/add_n.S
+++ b/sysdeps/sparc/add_n.S
@ -1,7 +1,7 @@
 ! sparc __mpn_add_n -- Add two limb vectors of the same length > 0 and store
 ! sum in a third limb vector.

-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.

 ! This file is part of the GNU MP Library.

@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_add_n):
 	sub	%g0,%o3,%o3
 	andcc	%o3,(16-1),%o3
 	be	Lzero
-	 nop
+	 mov	%o4,%g2			! put first s1_limb in g2 too

 	sll	%o3,2,%o3		! multiply by 4
 	sub	%o0,%o3,%o0		! adjust res_ptr
 	sub	%o1,%o3,%o1		! adjust s1_ptr
 	sub	%o2,%o3,%o2		! adjust s2_ptr

-	mov	%o4,%g2
-
+#if PIC
+	mov	%o7,%g4			! Save return address register
+	call	1f
+	add	%o7,Lbase-1f,%g3
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(Lbase),%g3
 	or	%g3,%lo(Lbase),%g3
+#endif
 	sll	%o3,2,%o3		! multiply by 4
 	jmp	%g3+%o3
-	 mov	%o5,%g3
+	 mov	%o5,%g3			! put first s2_limb in g3 too

 Loop:	addxcc	%g2,%g3,%o3
 	add	%o1,64,%o1
--- a/sysdeps/sparc/sparc8/addmul_1.S
+++ b/sysdeps/sparc/sparc8/addmul_1.S
@ -37,8 +37,15 @@ C_SYMBOL_NAME(__mpn_addmul_1):

 	sll	%o2,4,%g1
 	and	%g1,(4-1)<<4,%g1
+#if PIC
+	mov	%o7,%g4			! Save return address register
+	call	1f
+	add	%o7,LL-1f,%g3
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(LL),%g3
 	or	%g3,%lo(LL),%g3
+#endif
 	jmp	%g3+%g1
 	nop
 LL:
--- a/sysdeps/sparc/sparc8/mul_1.S
+++ b/sysdeps/sparc/sparc8/mul_1.S
@ -34,8 +34,15 @@
 C_SYMBOL_NAME(__mpn_mul_1):
 	sll	%o2,4,%g1
 	and	%g1,(4-1)<<4,%g1
+#if PIC
+	mov	%o7,%g4			! Save return address register
+	call	1f
+	add	%o7,LL-1f,%g3
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(LL),%g3
 	or	%g3,%lo(LL),%g3
+#endif
 	jmp	%g3+%g1
 	ld	[%o1+0],%o4	! 1
 LL:
--- a/sysdeps/sparc/sub_n.S
+++ b/sysdeps/sparc/sub_n.S
@ -1,7 +1,7 @@
 ! sparc __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 ! store difference in a third limb vector.

-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.

 ! This file is part of the GNU MP Library.

@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_sub_n):
 	sub	%g0,%o3,%o3
 	andcc	%o3,(16-1),%o3
 	be	Lzero
-	 nop
+	 mov	%o4,%g2			! put first s1_limb in g2 too

 	sll	%o3,2,%o3		! multiply by 4
 	sub	%o0,%o3,%o0		! adjust res_ptr
 	sub	%o1,%o3,%o1		! adjust s1_ptr
 	sub	%o2,%o3,%o2		! adjust s2_ptr

-	mov	%o4,%g2
-
+#if PIC
+	mov	%o7,%g4			! Save return address register
+	call	1f
+	add	%o7,Lbase-1f,%g3
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(Lbase),%g3
 	or	%g3,%lo(Lbase),%g3
+#endif
 	sll	%o3,2,%o3		! multiply by 4
 	jmp	%g3+%o3
-	 mov	%o5,%g3
+	 mov	%o5,%g3			! put first s2_limb in g3 too

 Loop:	subxcc	%g2,%g3,%o3
 	add	%o1,64,%o1
--- a/sysdeps/unix/sysv/linux/Dist
+++ b/sysdeps/unix/sysv/linux/Dist
@ -1,2 +1,3 @@
 sys/socketcall.h
 sys/timex.h
+nfs/nfs.h
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@ -20,7 +20,11 @@ sysdep_routines := $(sysdep_routines) ipc
 endif

 ifeq ($(subdir), socket)
-headers += sys/socketcall.h 
+headers += sys/socketcall.h
+endif
+
+ifeq ($(subdir), sunrpc)
+headers += nfs/nfs.h
 endif

 config-LDFLAGS = -Wl,-dynamic-linker=/lib/ld-gnu.so.1
--- a/sysdeps/unix/sysv/linux/i386/sysdep.h
+++ b/sysdeps/unix/sysv/linux/i386/sysdep.h
@ -93,43 +93,61 @@ Cambridge, MA 02139, USA.  */
   (2 * movl is less expensive than pushl + popl).

   Second unlike for the other registers we don't save the content of
-   %ecx and %edx when we have than 1 and 2 registers resp.  */
+   %ecx and %edx when we have than 1 and 2 registers resp.
+
+   The code below might look a bit long but we have to take care for
+   the pipelined processors (i586 and up).  Here the `pushl' and `popl'
+   instructions are marked as NP (not pairable) but the exception is
+   two consecutive of these instruction.  This gives no penalty on
+   i386 and i486 processors though.  */

 #undef	DO_CALL
 #define DO_CALL(args)					      		      \
+    PUSHARGS_##args							      \
    DOARGS_##args							      \
-    int $0x80;								      \
-    UNDOARGS_##args
+    int $0x80								      \
+    POPARGS_##args

+#define PUSHARGS_0	/* No arguments to push.  */
 #define	DOARGS_0	/* No arguments to frob.  */
-#define	UNDOARGS_0	/* No arguments to unfrob.  */
-#define	_DOARGS_0(n)	/* No arguments to frob.  */
-#define	_UNDOARGS_0	/* No arguments to unfrob.  */
+#define	POPARGS_0	/* No arguments to pop.  */
+#define	_PUSHARGS_0	/* No arguments to push.  */
+#define _DOARGS_0(n)	/* No arguments to frob.  */
+#define	_POPARGS_0	/* No arguments to pop.  */

-#define	DOARGS_1	movl %ebx, %edx; movl 4(%esp), %ebx; DOARGS_0
-#define	UNDOARGS_1	UNDOARGS_0; movl %edx, %ebx
-#define	_DOARGS_1(n)	pushl %ebx; movl n+4(%esp), %ebx; _DOARGS_0 (n)
-#define	_UNDOARGS_1	_UNDOARGS_0; popl %ebx
+#define PUSHARGS_1	movl %ebx, %edx; PUSHARGS_0
+#define	DOARGS_1	_DOARGS_1 (4)
+#define	POPARGS_1	POPARGS_0; movl %edx, %ebx
+#define	_PUSHARGS_1	pushl %ebx; _PUSHARGS_0
+#define _DOARGS_1(n)	movl n(%esp), %ebx; _DOARGS_0(n-4)
+#define	_POPARGS_1	_POPARGS_0; popl %ebx

-#define	DOARGS_2	movl 8(%esp), %ecx; DOARGS_1
-#define	UNDOARGS_2	UNDOARGS_1
+#define PUSHARGS_2	PUSHARGS_1
+#define	DOARGS_2	_DOARGS_2 (8)
+#define	POPARGS_2	POPARGS_1
+#define _PUSHARGS_2	_PUSHARGS_1
 #define	_DOARGS_2(n)	movl n(%esp), %ecx; _DOARGS_1 (n-4)
-#define	_UNDOARGS_2	_UNDOARGS_1
+#define	_POPARGS_2	_POPARGS_1

-#define DOARGS_3	_DOARGS_3 (12)
-#define UNDOARGS_3	_UNDOARGS_3
+#define PUSHARGS_3	_PUSHARGS_2
+#define DOARGS_3	_DOARGS_3 (16)
+#define POPARGS_3	_POPARGS_3
+#define _PUSHARGS_3	_PUSHARGS_2
 #define _DOARGS_3(n)	movl n(%esp), %edx; _DOARGS_2 (n-4)
-#define _UNDOARGS_3	_UNDOARGS_2
+#define _POPARGS_3	_POPARGS_2

-#define DOARGS_4	_DOARGS_4 (16)
-#define UNDOARGS_4	_UNDOARGS_4
-#define _DOARGS_4(n)	pushl %esi; movl n+4(%esp), %esi; _DOARGS_3 (n)
-#define _UNDOARGS_4	_UNDOARGS_3; popl %esi
-
-#define DOARGS_5	_DOARGS_5 (20)
-#define UNDOARGS_5	_UNDOARGS_5
-#define _DOARGS_5(n)	pushl %edi; movl n+4(%esp), %edi; _DOARGS_4 (n)
-#define _UNDOARGS_5	_UNDOARGS_4; popl %edi
+#define PUSHARGS_4	_PUSHARGS_4
+#define DOARGS_4	_DOARGS_4 (24)
+#define POPARGS_4	_POPARGS_4
+#define _PUSHARGS_4	pushl %esi; _PUSHARGS_3
+#define _DOARGS_4(n)	movl n(%esp), %esi; _DOARGS_3 (n-4)
+#define _POPARGS_4	_POPARGS_3; popl %esi

+#define PUSHARGS_5	_PUSHARGS_5
+#define DOARGS_5	_DOARGS_5 (32)
+#define POPARGS_5	_POPARGS_5
+#define _PUSHARGS_5	pushl %edi; _PUSHARGS_4
+#define _DOARGS_5(n)	movl n(%esp), %edi; _DOARGS_4 (n-4)
+#define _POPARGS_5	_POPARGS_4; popl %edi

 #endif	/* ASSEMBLER */
--- a/sysdeps/unix/sysv/linux/local_lim.h
+++ b/sysdeps/unix/sysv/linux/local_lim.h
@ -1,6 +1,6 @@
-/* Minimum guaranteed maximum values for system limits.  Hurd version.
+/* Minimum guaranteed maximum values for system limits.  Linux version.

-Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
 This file is part of the GNU C Library.

 The GNU C Library is free software; you can redistribute it and/or
@ -18,14 +18,5 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
 not, write to the Free Software Foundation, Inc., 675 Mass Ave,
 Cambridge, MA 02139, USA.  */

-/* Linux has a fixed limit of supplementary groups allocated with a
-   process.  This value is determined by the size of the `groups'
-   member of the `task_struct' structure in <linux/sched.h>.  */
-   
-#define NGROUPS_MAX	32
-
-
-/* Maximum size of file names.  Not all file system types support
-   this size but it is only a maximum value.  */
-
-#define NAME_MAX	255
+/* The kernel sources contain a file with all the needed information.  */
+#include <linux/limits.h>
--- a/sysdeps/unix/sysv/linux/nfs/nfs.h
+++ b/sysdeps/unix/sysv/linux/nfs/nfs.h
@ -0,0 +1 @@
+#include <linux/nfs.h>
--- a/sysdeps/unix/sysv/linux/sys/param.h
+++ b/sysdeps/unix/sysv/linux/sys/param.h
@ -1,3 +1,21 @@
+/* Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
 #ifndef _SYS_PARAM_H
 #define _SYS_PARAM_H

@ -7,26 +25,21 @@

 #include <sys/types.h>

-/* Don't change it. H.J. */
-#ifdef OLD_LINUX
-#undef	MAXHOSTNAMELEN
-#define MAXHOSTNAMELEN 	8	/* max length of hostname */
-#endif

 #ifndef howmany
-#define howmany(x, y)	(((x)+((y)-1))/(y))
+# define howmany(x, y)	(((x)+((y)-1))/(y))
 #endif

 #ifndef roundup
-#define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+# define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
 #endif

 #define MAXPATHLEN      PATH_MAX
 #define NOFILE          OPEN_MAX

 /*  Following the information of some of the kernel people I here assume
- *  that block size (i.e. the value of stat.st_blocks) for all filesystem
- *  is 512 bytes.  If not tell me or HJ.  -- Uli */
+    that block size (i.e. the value of stat.st_blocks) for all filesystem
+    is 512 bytes.  If not tell HJ, Roland, or me.  -- drepper */
 #define DEV_BSIZE       512

 #endif
--- a/Show More
+++ b/Show More