2000-12-08  Jakub Jelinek  <jakub@redhat.com>

	* elf/dl-load.c (fillin_rpath): Don't assume there is '\0' at
	cp + len.  Compute where from dirname.
	Reported by <jreiser@BitWagon.com>.

2000-12-08  Richard Henderson  <rth@twiddle.net>

	* sysdeps/alpha/_mcount.S: Fix typo.
	* sysdeps/alpha/strncpy.S: Likewise.

	* sysdeps/alpha/alphaev6/Implies: New file.
	* sysdeps/alpha/alphaev67/Implies: New file.
	* sysdeps/alpha/alphaev67/ffs.S: New file.
	* sysdeps/alpha/alphaev67/ffsll.S: New file.
	* sysdeps/alpha/alphaev67/rawmemchr.S: New file.
	* sysdeps/alpha/alphaev67/stpcpy.S: New file.
	* sysdeps/alpha/alphaev67/stpncpy.S: New file.
	* sysdeps/alpha/rawmemchr.S: New file.
	* sysdeps/alpha/strcat.S: Tail call to __stxcpy.
	* sysdeps/alpha/strcpy.S: Likewise.

	From GMP 3.1.1:
	* sysdeps/alpha/alphaev6/addmul_1.s: New file.

	From rick.gorton@alpha-processor.com:
	* sysdeps/alpha/alphaev6/memchr.S: New file.
	* sysdeps/alpha/alphaev6/memcpy.S: New file.
	* sysdeps/alpha/alphaev6/memset.S: New file.
	* sysdeps/alpha/alphaev6/stxcpy.S: New file.
	* sysdeps/alpha/alphaev6/stxncpy.S: New file.
	* sysdeps/alpha/alphaev67/strcat.S: New file.
	* sysdeps/alpha/alphaev67/strchr.S: New file.
	* sysdeps/alpha/alphaev67/strlen.S: New file.
	* sysdeps/alpha/alphaev67/strncat.S: New file.
	* sysdeps/alpha/htonl.S: Use a shorter sequence.

2000-12-08  Jakub Jelinek  <jakub@redhat.com>

	* inet/getnameinfo.c (getnameinfo): Fix NI_NOFQDN support.
	Reported by <pspencer@fields.utoronto.ca>.

2000-12-07  Jes Sorensen  <jes@linuxcare.com>

	* sysdeps/ia64/elf/start.S (__data_start): Add __data_start variable.
	Pointed out by Hans Boehm.

2000-12-07  H.J. Lu  <hjl@gnu.org>

	* elf/dl-version.c (match_symbol): Check map->l_name[0] for printing.

2000-12-07  Andreas Jaeger  <aj@suse.de>

	* misc/error.c: Add format attributes for __error and __error_at_line.

	* nscd/dbg_log.h: Add format attribute.

2000-12-08  Ulrich Drepper  <drepper@redhat.com>

	* misc/sys/syslog.h: Add format attributes to syslog and vsyslog.
	Patch by Joseph S. Myers <jsm28@cam.ac.uk>.

	* sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to _dl_hwcap.
	* manual/install.texi: Recommend to set LANGUAGE=C LC_ALL=C during
This commit is contained in:
Ulrich Drepper 2000-12-08 17:27:11 +00:00
parent 4e9b4067d7
commit 104d0bd3ef
31 changed files with 2752 additions and 27 deletions

View File

@ -1,3 +1,65 @@
2000-12-08 Jakub Jelinek <jakub@redhat.com>
* elf/dl-load.c (fillin_rpath): Don't assume there is '\0' at
cp + len. Compute where from dirname.
Reported by <jreiser@BitWagon.com>.
2000-12-08 Richard Henderson <rth@twiddle.net>
* sysdeps/alpha/_mcount.S: Fix typo.
* sysdeps/alpha/strncpy.S: Likewise.
* sysdeps/alpha/alphaev6/Implies: New file.
* sysdeps/alpha/alphaev67/Implies: New file.
* sysdeps/alpha/alphaev67/ffs.S: New file.
* sysdeps/alpha/alphaev67/ffsll.S: New file.
* sysdeps/alpha/alphaev67/rawmemchr.S: New file.
* sysdeps/alpha/alphaev67/stpcpy.S: New file.
* sysdeps/alpha/alphaev67/stpncpy.S: New file.
* sysdeps/alpha/rawmemchr.S: New file.
* sysdeps/alpha/strcat.S: Tail call to __stxcpy.
* sysdeps/alpha/strcpy.S: Likewise.
From GMP 3.1.1:
* sysdeps/alpha/alphaev6/addmul_1.s: New file.
From rick.gorton@alpha-processor.com:
* sysdeps/alpha/alphaev6/memchr.S: New file.
* sysdeps/alpha/alphaev6/memcpy.S: New file.
* sysdeps/alpha/alphaev6/memset.S: New file.
* sysdeps/alpha/alphaev6/stxcpy.S: New file.
* sysdeps/alpha/alphaev6/stxncpy.S: New file.
* sysdeps/alpha/alphaev67/strcat.S: New file.
* sysdeps/alpha/alphaev67/strchr.S: New file.
* sysdeps/alpha/alphaev67/strlen.S: New file.
* sysdeps/alpha/alphaev67/strncat.S: New file.
* sysdeps/alpha/htonl.S: Use a shorter sequence.
2000-12-08 Jakub Jelinek <jakub@redhat.com>
* inet/getnameinfo.c (getnameinfo): Fix NI_NOFQDN support.
Reported by <pspencer@fields.utoronto.ca>.
2000-12-07 Jes Sorensen <jes@linuxcare.com>
* sysdeps/ia64/elf/start.S (__data_start): Add __data_start variable.
Pointed out by Hans Boehm.
2000-12-07 H.J. Lu <hjl@gnu.org>
* elf/dl-version.c (match_symbol): Check map->l_name[0] for printing.
2000-12-07 Andreas Jaeger <aj@suse.de>
* misc/error.c: Add format attributes for __error and __error_at_line.
* nscd/dbg_log.h: Add format attribute.
2000-12-08 Ulrich Drepper <drepper@redhat.com>
* misc/sys/syslog.h: Add format attributes to syslog and vsyslog.
Patch by Joseph S. Myers <jsm28@cam.ac.uk>.
2000-12-07 Dan Pop <Dan.Pop@cern.ch> 2000-12-07 Dan Pop <Dan.Pop@cern.ch>
* sysdeps/ia64/strcpy.S: Fix a bug in a recovery code sequence. * sysdeps/ia64/strcpy.S: Fix a bug in a recovery code sequence.
@ -948,8 +1010,7 @@
2000-11-14 Andreas Jaeger <aj@suse.de> 2000-11-14 Andreas Jaeger <aj@suse.de>
* sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to * sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to _dl_hwcap.
_dl_hwcap.
2000-11-13 Roland McGrath <roland@frob.com> 2000-11-13 Roland McGrath <roland@frob.com>
@ -992,7 +1053,7 @@
2000-11-03 Bruno Haible <haible@clisp.cons.org> 2000-11-03 Bruno Haible <haible@clisp.cons.org>
* manual/install.texi: Recommend to set LANGUAGE=C LC_ALL-C during * manual/install.texi: Recommend to set LANGUAGE=C LC_ALL=C during
"make install", to work around a binary incompatibility between "make install", to work around a binary incompatibility between
glibc 2.1 and glibc 2.2 gconv modules. glibc 2.1 and glibc 2.2 gconv modules.

View File

@ -419,7 +419,7 @@ fillin_rpath (char *rpath, struct r_search_path_elem **result, const char *sep,
dirp->dirname = ((char *) dirp + sizeof (*dirp) dirp->dirname = ((char *) dirp + sizeof (*dirp)
+ ncapstr * sizeof (enum r_dir_status)); + ncapstr * sizeof (enum r_dir_status));
memcpy ((char *) dirp->dirname, cp, len + 1); *((char *) __mempcpy ((char *) dirp->dirname, cp, len)) = '\0';
dirp->dirnamelen = len; dirp->dirnamelen = len;
if (len > max_dirnamelen) if (len > max_dirnamelen)

View File

@ -95,7 +95,7 @@ match_symbol (const char *name, ElfW(Word) hash, const char *string,
only print a message if verbose output is requested. */ only print a message if verbose output is requested. */
if (verbose) if (verbose)
/* XXX We cannot translate the messages. */ /* XXX We cannot translate the messages. */
_dl_signal_cerror (0, map->l_name, _dl_signal_cerror (0, map->l_name[0] ? map->l_name : _dl_argv[0],
make_string ("\ make_string ("\
no version information available (required by ", no version information available (required by ",
name, ")")); name, ")"));

View File

@ -259,7 +259,7 @@ getnameinfo (const struct sockaddr *sa, socklen_t addrlen, char *host,
if (h) if (h)
{ {
char *c; char *c;
if ((flags & NI_NOFQDN) == 0 if ((flags & NI_NOFQDN)
&& (c = nrl_domainname ()) && (c = nrl_domainname ())
&& (c = strstr (h->h_name, c)) && (c = strstr (h->h_name, c))
&& (c != h->h_name) && (*(--c) == '.')) && (c != h->h_name) && (*(--c) == '.'))

View File

@ -74,10 +74,12 @@ unsigned int error_message_count;
/* In GNU libc we want do not want to use the common name `error' directly. /* In GNU libc we want do not want to use the common name `error' directly.
Instead make it a weak alias. */ Instead make it a weak alias. */
extern void __error (int status, int errnum, const char *message, ...); extern void __error (int status, int errnum, const char *message, ...)
__attribute__ ((__format__ (__printf__, 3, 4)));
extern void __error_at_line (int status, int errnum, const char *file_name, extern void __error_at_line (int status, int errnum, const char *file_name,
unsigned int line_number, const char *message, unsigned int line_number, const char *message,
...); ...)
__attribute__ ((__format__ (__printf__, 5, 6)));;
# define error __error # define error __error
# define error_at_line __error_at_line # define error_at_line __error_at_line

View File

@ -179,12 +179,13 @@ extern void openlog (__const char *__ident, int __option, int __facility)
extern int setlogmask (int __mask) __THROW; extern int setlogmask (int __mask) __THROW;
/* Generate a log message using FMT string and option arguments. */ /* Generate a log message using FMT string and option arguments. */
extern void syslog (int __pri, __const char *__fmt, ...) __THROW; extern void syslog (int __pri, __const char *__fmt, ...) __THROW
__attribute__ ((__format__(__printf__, 2, 3)));
#ifdef __USE_BSD #ifdef __USE_BSD
/* Generate a log message using FMT and using arguments pointed to by AP. */ /* Generate a log message using FMT and using arguments pointed to by AP. */
extern void vsyslog (int __pri, __const char *__fmt, __gnuc_va_list __ap) extern void vsyslog (int __pri, __const char *__fmt, __gnuc_va_list __ap)
__THROW; __THROW __attribute__ ((__format__(__printf__, 2, 0)));
#endif #endif
__END_DECLS __END_DECLS

View File

@ -22,7 +22,8 @@
extern int debug_level; extern int debug_level;
extern void dbg_log (const char *str, ...); extern void dbg_log (const char *str, ...)
__attribute__ ((__format__ (__printf__, 1, 0)));;
extern int set_logfile (const char *logfile); extern int set_logfile (const char *logfile);

View File

@ -27,7 +27,7 @@
compiler treats those calls as if they were instructions. In compiler treats those calls as if they were instructions. In
particular, it doesn't save any of the temporary registers (caller particular, it doesn't save any of the temporary registers (caller
saved registers). It is therefore necessary to preserve all saved registers). It is therefore necessary to preserve all
caller-saved registers as well caller-saved registers as well.
Upon entering _mcount, register $at holds the return address and ra Upon entering _mcount, register $at holds the return address and ra
holds the return address of the function's caller (selfpc and frompc, holds the return address of the function's caller (selfpc and frompc,

View File

@ -0,0 +1 @@
alpha/alphaev5

View File

@ -0,0 +1,479 @@
# Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
# the result to a second limb vector.
#
# Copyright (C) 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.
# INPUT PARAMETERS
# res_ptr $16
# s1_ptr $17
# size $18
# s2_limb $19
#
# This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
# exactly 3.625 cycles/limb on EV6...
#
# This code was written in close cooperation with ev6 pipeline expert
# Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
#
# Register usages for unrolled loop:
# 0-3 mul's
# 4-7 acc's
# 8-15 mul results
# 20,21 carry's
# 22,23 save for stores
#
# Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
#
# The stores can issue a cycle late so we have paired no-op's to 'catch'
# them, so that further disturbance to the schedule is damped.
#
# We couldn't pair the loads, because the entangled schedule of the
# carry's has to happen on one side {0} of the machine. Note the total
# use of U0, and the total use of L0 (after attending to the stores),
# which is part of the reason why.
#
# This is a great schedule for the d_cache, a poor schedule for the
# b_cache. The lockup on U0 means that any stall can't be recovered
# from. Consider a ldq in L1. Say that load gets stalled because it
# collides with a fill from the b_cache. On the next cycle, this load
# gets priority. It first looks at L0, and goes there. The instruction
# we intended for L0 gets to look at L1, which is NOT where we want
# it. It either stalls 1, because it can't go in L0, or goes there, and
# causes a further instruction to stall.
#
# So for b_cache, we're likely going to want to put one or more cycles
# back into the code! And, of course, put in prefetches. For the
# accumulator, lds, intent to modify. For the multiplier, you might
# want ldq, evict next, if you're not wanting to use it again soon. Use
# 256 ahead of present pointer value. At a place where we have an mt
# followed by a bookkeeping, put the bookkeeping in upper, and the
# prefetch into lower.
#
# Note, the usage of physical registers per cycle is smoothed off, as
# much as possible.
#
# Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
# like not to have a ldq or stq precede a conditional branch in a
# quadpack. The conditional branch moves the retire pointer one cycle
# later.
#
# Optimization notes:
# Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
# Reserved regs: $29 $30 $31
# Free caller-saves regs in unrolled code: $24 $25 $28
# We should swap some of the callee-saves regs for some of the free
# caller-saves regs, saving some overhead cycles.
# Most importantly, we should write fast code for the 0-7 case.
# The code we use there is for the 21164, and runs at 7 cycles/limb
# on the 21264. Should not be hard, if we write specialized code for
# 1-7 limbs (the one for 0 limbs should be straightforward). We then just
# need a jump table indexed by the low 3 bits of the count argument.
# __mpn_addmul_1 -- for each limb i: res_ptr[i] += s1_ptr[i] * s2_limb,
# propagating the carry from limb to limb.
#
# In:	$16 = res_ptr, $17 = s1_ptr, $18 = size (in limbs), $19 = s2_limb
# Out:	$0  = carry limb left over after the last limb
#
# Structure:
#   size < 8  : simple one-limb-per-iteration loop ($Loop0).
#   size >= 8 : ($Large) the size % 8 leading limbs go through an identical
#		simple loop ($Loop1), then the remaining size / 8 groups of
#		eight limbs go through the software-pipelined unrolled loop
#		($Lunroll / $Loop / $Lend).
#
# The first limb is loaded unconditionally, so size == 0 is not handled
# here -- presumably callers guarantee size >= 1; TODO confirm.
.set noreorder
.set noat
.text
.globl __mpn_addmul_1
.ent __mpn_addmul_1
__mpn_addmul_1:
.frame $30,0,$26,0
.prologue 0
# Unsigned test: $1 = (size < 8).  Sizes of 8 or more take the $Large path.
cmpult $18, 8, $1
beq $1, $Large
# ---- Small case (1..7 limbs).  The first limb is peeled off so that the
# ---- multiply for limb N+1 can be issued while limb N is being added.
ldq $2, 0($17) # $2 = s1_limb
addq $17, 8, $17 # s1_ptr++
subq $18, 1, $18 # size--
mulq $2, $19, $3 # $3 = prod_low
ldq $5, 0($16) # $5 = *res_ptr
umulh $2, $19, $0 # $0 = prod_high
beq $18, $Lend0b # jump if size was == 1
ldq $2, 0($17) # $2 = s1_limb
addq $17, 8, $17 # s1_ptr++
subq $18, 1, $18 # size--
addq $5, $3, $3
cmpult $3, $5, $4
stq $3, 0($16)
addq $16, 8, $16 # res_ptr++
beq $18, $Lend0a # jump if size was == 2
.align 3
# Loop invariant on entry: $2 = current s1 limb, $0 = high product of the
# previous limb, $4 = carry bit from the previous limb's accumulate.
$Loop0: mulq $2, $19, $3 # $3 = prod_low
ldq $5, 0($16) # $5 = *res_ptr
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
subq $18, 1, $18 # size--
umulh $2, $19, $4 # $4 = cy_limb
ldq $2, 0($17) # $2 = s1_limb
addq $17, 8, $17 # s1_ptr++
addq $3, $0, $3 # $3 = cy_limb + prod_low
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
addq $5, $3, $3
cmpult $3, $5, $5
stq $3, 0($16)
addq $16, 8, $16 # res_ptr++
addq $5, $0, $0 # combine carries
bne $18, $Loop0
# Last limb of the small case: same arithmetic as the loop body, but with
# no further s1 load and with the final carry limb assembled in $0.
$Lend0a:
mulq $2, $19, $3 # $3 = prod_low
ldq $5, 0($16) # $5 = *res_ptr
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
umulh $2, $19, $4 # $4 = cy_limb
addq $3, $0, $3 # $3 = cy_limb + prod_low
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
addq $5, $3, $3
cmpult $3, $5, $5
stq $3, 0($16)
addq $5, $0, $0 # combine carries
addq $4, $0, $0 # cy_limb = prod_high + cy
ret $31, ($26), 1
# size == 1: only the peeled first limb exists.
$Lend0b:
addq $5, $3, $3
cmpult $3, $5, $5
stq $3, 0($16)
addq $0, $5, $0
ret $31, ($26), 1
# ---- Large case (size >= 8).
# Allocates 240 bytes of stack and saves $9..$15, which the unrolled loop
# uses for multiply results; only offsets 8..56 of the area are written.
# NOTE(review): the .frame directive above declares a 0-byte frame while
# this path lowers $30 by 240 -- debug/unwind information may not describe
# this frame; confirm whether that matters for this routine.
$Large:
lda $30, -240($30)
stq $9, 8($30)
stq $10, 16($30)
stq $11, 24($30)
stq $12, 32($30)
stq $13, 40($30)
stq $14, 48($30)
stq $15, 56($30)
and $18, 7, $20 # count for the first loop, 0-7
srl $18, 3, $18 # count for unrolled loop
# $0 = 0: carry-in for the unrolled loop when there are no leading limbs.
bis $31, $31, $0
beq $20, $Lunroll
# Leading size % 8 limbs: same scheme as the small case above, counting
# in $20 instead of $18 and falling into $Lunroll instead of returning.
ldq $2, 0($17) # $2 = s1_limb
addq $17, 8, $17 # s1_ptr++
subq $20, 1, $20 # size--
mulq $2, $19, $3 # $3 = prod_low
ldq $5, 0($16) # $5 = *res_ptr
umulh $2, $19, $0 # $0 = prod_high
beq $20, $Lend1b # jump if size was == 1
ldq $2, 0($17) # $2 = s1_limb
addq $17, 8, $17 # s1_ptr++
subq $20, 1, $20 # size--
addq $5, $3, $3
cmpult $3, $5, $4
stq $3, 0($16)
addq $16, 8, $16 # res_ptr++
beq $20, $Lend1a # jump if size was == 2
.align 3
$Loop1: mulq $2, $19, $3 # $3 = prod_low
ldq $5, 0($16) # $5 = *res_ptr
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
subq $20, 1, $20 # size--
umulh $2, $19, $4 # $4 = cy_limb
ldq $2, 0($17) # $2 = s1_limb
addq $17, 8, $17 # s1_ptr++
addq $3, $0, $3 # $3 = cy_limb + prod_low
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
addq $5, $3, $3
cmpult $3, $5, $5
stq $3, 0($16)
addq $16, 8, $16 # res_ptr++
addq $5, $0, $0 # combine carries
bne $20, $Loop1
$Lend1a:
mulq $2, $19, $3 # $3 = prod_low
ldq $5, 0($16) # $5 = *res_ptr
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
umulh $2, $19, $4 # $4 = cy_limb
addq $3, $0, $3 # $3 = cy_limb + prod_low
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
addq $5, $3, $3
cmpult $3, $5, $5
stq $3, 0($16)
addq $16, 8, $16 # res_ptr++
addq $5, $0, $0 # combine carries
addq $4, $0, $0 # cy_limb = prod_high + cy
br $31, $Lunroll
$Lend1b:
addq $5, $3, $3
cmpult $3, $5, $5
stq $3, 0($16)
addq $16, 8, $16 # res_ptr++
addq $0, $5, $0
# ---- Unrolled part.  On entry $0 holds the carry limb from the leading
# ---- limbs (or 0), $18 the number of eight-limb groups (>= 1).
# Both pointers are biased by -16, so the first limbs are addressed at
# offsets 16 and 24 below.  The carry-in is moved to $12, which is where
# the pipelined code expects the previous limb's high product.
$Lunroll:
lda $17, -16($17) # L1 bookkeeping
lda $16, -16($16) # L1 bookkeeping
bis $0, $31, $12
# ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
# The startup stores the first two result limbs of the first group; the
# remaining six are stored either by the main loop or by the finish code.
ldq $2, 16($17) # L1
ldq $3, 24($17) # L1
lda $18, -1($18) # L1 bookkeeping
ldq $6, 16($16) # L1
ldq $7, 24($16) # L1
ldq $0, 32($17) # L1
mulq $19, $2, $13 # U1
ldq $1, 40($17) # L1
umulh $19, $2, $14 # U1
mulq $19, $3, $15 # U1
lda $17, 64($17) # L1 bookkeeping
ldq $4, 32($16) # L1
ldq $5, 40($16) # L1
umulh $19, $3, $8 # U1
ldq $2, -16($17) # L1
mulq $19, $0, $9 # U1
ldq $3, -8($17) # L1
umulh $19, $0, $10 # U1
addq $6, $13, $6 # L0 lo + acc
mulq $19, $1, $11 # U1
cmpult $6, $13, $20 # L0 lo add => carry
lda $16, 64($16) # L1 bookkeeping
addq $6, $12, $22 # U0 hi add => answer
cmpult $22, $12, $21 # L0 hi add => carry
addq $14, $20, $14 # U0 hi mul + carry
ldq $6, -16($16) # L1
addq $7, $15, $23 # L0 lo + acc
addq $14, $21, $14 # U0 hi mul + carry
ldq $7, -8($16) # L1
umulh $19, $1, $12 # U1
cmpult $23, $15, $20 # L0 lo add => carry
addq $23, $14, $23 # U0 hi add => answer
ldq $0, 0($17) # L1
mulq $19, $2, $13 # U1
cmpult $23, $14, $21 # L0 hi add => carry
addq $8, $20, $8 # U0 hi mul + carry
ldq $1, 8($17) # L1
umulh $19, $2, $14 # U1
addq $4, $9, $4 # L0 lo + acc
stq $22, -48($16) # L0
stq $23, -40($16) # L1
mulq $19, $3, $15 # U1
addq $8, $21, $8 # U0 hi mul + carry
cmpult $4, $9, $20 # L0 lo add => carry
addq $4, $8, $22 # U0 hi add => answer
# Only one group of eight limbs: skip the main loop and drain the pipeline.
ble $18, $Lend # U1 bookkeeping
# ____ MAIN UNROLLED LOOP ____
# Each iteration stores eight result limbs and decrements $18 once.  The
# body is four structurally identical two-limb sections that rotate through
# the register sets; "bis $31, $31, $31" writes the zero register and is
# used purely as a scheduling no-op ("mt" / "st slosh").
.align 4
$Loop:
bis $31, $31, $31 # U1 mt
cmpult $22, $8, $21 # L0 hi add => carry
addq $10, $20, $10 # U0 hi mul + carry
ldq $4, 0($16) # L1
bis $31, $31, $31 # U1 mt
addq $5, $11, $23 # L0 lo + acc
addq $10, $21, $10 # L0 hi mul + carry
ldq $5, 8($16) # L1
umulh $19, $3, $8 # U1
cmpult $23, $11, $20 # L0 lo add => carry
addq $23, $10, $23 # U0 hi add => answer
ldq $2, 16($17) # L1
mulq $19, $0, $9 # U1
cmpult $23, $10, $21 # L0 hi add => carry
addq $12, $20, $12 # U0 hi mul + carry
ldq $3, 24($17) # L1
umulh $19, $0, $10 # U1
addq $6, $13, $6 # L0 lo + acc
stq $22, -32($16) # L0
stq $23, -24($16) # L1
bis $31, $31, $31 # L0 st slosh
mulq $19, $1, $11 # U1
bis $31, $31, $31 # L1 st slosh
addq $12, $21, $12 # U0 hi mul + carry
cmpult $6, $13, $20 # L0 lo add => carry
bis $31, $31, $31 # U1 mt
lda $18, -1($18) # L1 bookkeeping
addq $6, $12, $22 # U0 hi add => answer
bis $31, $31, $31 # U1 mt
cmpult $22, $12, $21 # L0 hi add => carry
addq $14, $20, $14 # U0 hi mul + carry
ldq $6, 16($16) # L1
bis $31, $31, $31 # U1 mt
addq $7, $15, $23 # L0 lo + acc
addq $14, $21, $14 # U0 hi mul + carry
ldq $7, 24($16) # L1
umulh $19, $1, $12 # U1
cmpult $23, $15, $20 # L0 lo add => carry
addq $23, $14, $23 # U0 hi add => answer
ldq $0, 32($17) # L1
mulq $19, $2, $13 # U1
cmpult $23, $14, $21 # L0 hi add => carry
addq $8, $20, $8 # U0 hi mul + carry
ldq $1, 40($17) # L1
umulh $19, $2, $14 # U1
addq $4, $9, $4 # U0 lo + acc
stq $22, -16($16) # L0
stq $23, -8($16) # L1
bis $31, $31, $31 # L0 st slosh
mulq $19, $3, $15 # U1
bis $31, $31, $31 # L1 st slosh
addq $8, $21, $8 # L0 hi mul + carry
cmpult $4, $9, $20 # L0 lo add => carry
bis $31, $31, $31 # U1 mt
lda $17, 64($17) # L1 bookkeeping
addq $4, $8, $22 # U0 hi add => answer
bis $31, $31, $31 # U1 mt
cmpult $22, $8, $21 # L0 hi add => carry
addq $10, $20, $10 # U0 hi mul + carry
ldq $4, 32($16) # L1
bis $31, $31, $31 # U1 mt
addq $5, $11, $23 # L0 lo + acc
addq $10, $21, $10 # L0 hi mul + carry
ldq $5, 40($16) # L1
umulh $19, $3, $8 # U1
cmpult $23, $11, $20 # L0 lo add => carry
addq $23, $10, $23 # U0 hi add => answer
ldq $2, -16($17) # L1
mulq $19, $0, $9 # U1
cmpult $23, $10, $21 # L0 hi add => carry
addq $12, $20, $12 # U0 hi mul + carry
ldq $3, -8($17) # L1
umulh $19, $0, $10 # U1
addq $6, $13, $6 # L0 lo + acc
stq $22, 0($16) # L0
stq $23, 8($16) # L1
bis $31, $31, $31 # L0 st slosh
mulq $19, $1, $11 # U1
bis $31, $31, $31 # L1 st slosh
addq $12, $21, $12 # U0 hi mul + carry
cmpult $6, $13, $20 # L0 lo add => carry
bis $31, $31, $31 # U1 mt
lda $16, 64($16) # L1 bookkeeping
addq $6, $12, $22 # U0 hi add => answer
bis $31, $31, $31 # U1 mt
cmpult $22, $12, $21 # L0 hi add => carry
addq $14, $20, $14 # U0 hi mul + carry
ldq $6, -16($16) # L1
bis $31, $31, $31 # U1 mt
addq $7, $15, $23 # L0 lo + acc
addq $14, $21, $14 # U0 hi mul + carry
ldq $7, -8($16) # L1
umulh $19, $1, $12 # U1
cmpult $23, $15, $20 # L0 lo add => carry
addq $23, $14, $23 # U0 hi add => answer
ldq $0, 0($17) # L1
mulq $19, $2, $13 # U1
cmpult $23, $14, $21 # L0 hi add => carry
addq $8, $20, $8 # U0 hi mul + carry
ldq $1, 8($17) # L1
umulh $19, $2, $14 # U1
addq $4, $9, $4 # L0 lo + acc
stq $22, -48($16) # L0
stq $23, -40($16) # L1
bis $31, $31, $31 # L0 st slosh
mulq $19, $3, $15 # U1
bis $31, $31, $31 # L1 st slosh
addq $8, $21, $8 # U0 hi mul + carry
cmpult $4, $9, $20 # L0 lo add => carry
addq $4, $8, $22 # U0 hi add => answer
bis $31, $31, $31 # L1 mt
bgt $18, $Loop # U1 bookkeeping
# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
# Drains the pipeline: finishes the six limbs of the last group that are
# still in flight (no further s1 loads are issued), then forms the final
# carry limb in $0.
$Lend:
cmpult $22, $8, $21 # L0 hi add => carry
addq $10, $20, $10 # U0 hi mul + carry
ldq $4, 0($16) # L1
addq $5, $11, $23 # L0 lo + acc
addq $10, $21, $10 # L0 hi mul + carry
ldq $5, 8($16) # L1
umulh $19, $3, $8 # U1
cmpult $23, $11, $20 # L0 lo add => carry
addq $23, $10, $23 # U0 hi add => answer
mulq $19, $0, $9 # U1
cmpult $23, $10, $21 # L0 hi add => carry
addq $12, $20, $12 # U0 hi mul + carry
umulh $19, $0, $10 # U1
addq $6, $13, $6 # L0 lo + acc
stq $22, -32($16) # L0
stq $23, -24($16) # L1
mulq $19, $1, $11 # U1
addq $12, $21, $12 # U0 hi mul + carry
cmpult $6, $13, $20 # L0 lo add => carry
addq $6, $12, $22 # U0 hi add => answer
cmpult $22, $12, $21 # L0 hi add => carry
addq $14, $20, $14 # U0 hi mul + carry
addq $7, $15, $23 # L0 lo + acc
addq $14, $21, $14 # U0 hi mul + carry
umulh $19, $1, $12 # U1
cmpult $23, $15, $20 # L0 lo add => carry
addq $23, $14, $23 # U0 hi add => answer
cmpult $23, $14, $21 # L0 hi add => carry
addq $8, $20, $8 # U0 hi mul + carry
addq $4, $9, $4 # U0 lo + acc
stq $22, -16($16) # L0
stq $23, -8($16) # L1
bis $31, $31, $31 # L0 st slosh
addq $8, $21, $8 # L0 hi mul + carry
cmpult $4, $9, $20 # L0 lo add => carry
addq $4, $8, $22 # U0 hi add => answer
cmpult $22, $8, $21 # L0 hi add => carry
addq $10, $20, $10 # U0 hi mul + carry
addq $5, $11, $23 # L0 lo + acc
addq $10, $21, $10 # L0 hi mul + carry
cmpult $23, $11, $20 # L0 lo add => carry
addq $23, $10, $23 # U0 hi add => answer
cmpult $23, $10, $21 # L0 hi add => carry
addq $12, $20, $12 # U0 hi mul + carry
stq $22, 0($16) # L0
stq $23, 8($16) # L1
# Return value: high product of the last limb plus its pending carry.
addq $12, $21, $0 # U0 hi mul + carry
# Restore the registers saved at $Large and release the 240-byte area.
ldq $9, 8($30)
ldq $10, 16($30)
ldq $11, 24($30)
ldq $12, 32($30)
ldq $13, 40($30)
ldq $14, 48($30)
ldq $15, 56($30)
lda $30, 240($30)
ret $31, ($26), 1
.end __mpn_addmul_1

View File

@ -0,0 +1,192 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by David Mosberger (davidm@cs.arizona.edu).
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
# __memchr -- find the first occurrence of a byte in a memory area.
#
# Register roles, as used by the code below:
#	$16	start address of the area (need not be aligned)
#	$17	byte to look for; only the low 8 bits are used
#	$18	number of bytes to search
#	$0	result: address of the first matching byte, or 0 when
#		there is no match within the given length
#
# Approach: the byte is replicated into all eight lanes of $17, each
# quadword of the area is XORed with it so that matching bytes become
# zero, and cmpbge against $31 (zero) yields one bit per zero byte.
ENTRY(__memchr)
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
# Hack -- if someone passes in (size_t)-1, hoping to just
# search til the end of the address space, we will overflow
# below when we find the address of the last byte. Given
# that we will never have a 56-bit address space, cropping
# the length is the easiest way to avoid trouble.
# (zap with mask 0x80 clears the most significant byte of the length;
# the cropped value lives in $5, while $18 keeps the caller value.)
zap $18, 0x80, $5 # U : Bound length
beq $18, $not_found # U :
ldq_u $1, 0($16) # L : load first quadword Latency=3
and $17, 0xff, $17 # E : L L U U : 00000000000000ch
insbl $17, 1, $2 # U : 000000000000ch00
cmpult $18, 9, $4 # E : small (<= 1 quad) string?
or $2, $17, $17 # E : 000000000000chch
lda $3, -1($31) # E : U L L U
sll $17, 16, $2 # U : 00000000chch0000
addq $16, $5, $5 # E : Max search address
or $2, $17, $17 # E : 00000000chchchch
sll $17, 32, $2 # U : U L L U : chchchch00000000
or $2, $17, $17 # E : chchchchchchchch
extql $1, $16, $7 # U : $7 is upper bits
beq $4, $first_quad # U :
# Short case (1..8 bytes): assemble the possibly unaligned quadword that
# starts at $16 from the two aligned quadwords covering the first and the
# last byte of the area.
ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3
extqh $6, $16, $6 # U : 2 cycle stall for $6
mov $16, $0 # E :
nop # E :
or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
# Deal with the case where at most 8 bytes remain to be searched
# in $1. E.g.:
# $18 = 6
# $1 = ????c6c5c4c3c2c1
# Here $0 is the address of the byte held in the low lane of $1, and $3
# is all ones, so shifting it right by (-$18 mod 64) leaves $18 low bits.
$last_quad:
negq $18, $6 # E :
xor $17, $1, $1 # E :
srl $3, $6, $6 # U : $6 = mask of $18 bits set
cmpbge $31, $1, $2 # E : L U L U
nop
nop
and $2, $6, $2 # E :
beq $2, $not_found # U : U L U L
# Entry condition: $2 has one bit set per matching byte of the quadword
# whose low-lane address is in $0 (at least one bit is set).  The index
# of the lowest set bit is added to $0 to form the result.
$found_it:
#if defined(__alpha_fix__) && defined(__alpha_cix__)
/*
* Since we are guaranteed to have set one of the bits, we don't
* have to worry about coming back with a 0x40 out of cttz...
*/
cttz $2, $3 # U0 :
addq $0, $3, $0 # E : All done
nop # E :
ret # L0 : L U L U
#else
/*
* Slow and clunky. It can probably be improved.
* An exercise left for others.
*/
# Without cttz: isolate the lowest set bit ($2 & -$2), then binary-search
# its position by testing it against 0x0f, 0x33 and 0x55 and conditionally
# adding 4, 2 and 1 to $0.
negq $2, $3 # E :
and $2, $3, $2 # E :
and $2, 0x0f, $1 # E :
addq $0, 4, $3 # E :
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
nop # E : keep with cmov
and $2, 0x33, $1 # E :
addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
nop # E : keep with cmov
and $2, 0x55, $1 # E :
addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
nop
nop
ret # L0 : L U L U
#endif
# Deal with the case where $18 > 8 bytes remain to be
# searched. $16 may not be aligned.
# $0 becomes the aligned address of the first quadword; the bytes of that
# quadword that lie below $16 are forced to 0xff after the XOR so that
# they can never compare as a match.
.align 4
$first_quad:
andnot $16, 0x7, $0 # E :
insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
xor $1, $17, $1 # E :
or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff
cmpbge $31, $1, $2 # E :
bne $2, $found_it # U :
# At least one byte left to process.
ldq $1, 8($0) # L :
subq $5, 1, $18 # E : U L U L
addq $0, 8, $0 # E :
# Make $18 point to last quad to be accessed (the
# last quad may or may not be partial).
andnot $18, 0x7, $18 # E :
cmpult $0, $18, $2 # E :
beq $2, $final # U : U L U L
# At least two quads remain to be accessed.
subq $18, $0, $4 # E : $4 <- nr quads to be processed
and $4, 8, $4 # E : odd number of quads?
bne $4, $odd_quad_count # U :
# At least three quads remain to be accessed
mov $1, $4 # E : L U L U : move prefetched value to correct reg
# Two quadwords per iteration, alternating between $4 and $1 as the
# register that holds the quadword under test while the other one is
# being loaded ahead.  $0 tracks the address of the quadword under test.
.align 4
$unrolled_loop:
ldq $1, 8($0) # L : prefetch $1
xor $17, $4, $2 # E :
cmpbge $31, $2, $2 # E :
bne $2, $found_it # U : U L U L
addq $0, 8, $0 # E :
nop # E :
nop # E :
nop # E :
$odd_quad_count:
xor $17, $1, $2 # E :
ldq $4, 8($0) # L : prefetch $4
cmpbge $31, $2, $2 # E :
addq $0, 8, $6 # E :
bne $2, $found_it # U :
cmpult $6, $18, $6 # E :
addq $0, 8, $0 # E :
nop # E :
bne $6, $unrolled_loop # U :
mov $4, $1 # E : move prefetched value into $1
nop # E :
nop # E :
# $0 now addresses the last quadword and $1 holds its contents; $5 - $0
# is the number of bytes of it that still belong to the search area.
$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
nop # E :
nop # E :
bne $18, $last_quad # U :
# No match: return 0.
$not_found:
mov $31, $0 # E :
nop # E :
nop # E :
ret # L0 :
END(__memchr)
weak_alias (__memchr, memchr)
#if !__BOUNDED_POINTERS__
weak_alias (__memchr, __ubp_memchr)
#endif

View File

@ -0,0 +1,254 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
*
* Temp usage notes:
* $0 - destination address
* $1,$2, - scratch
*/
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
/*
 * void *memcpy (void *dest, const void *src, size_t n)
 *
 * In:   $16 = dest, $17 = src, $18 = byte count (tested as a signed
 *       value: a count <= 0 copies nothing)
 * Out:  $0  = the original dest
 * Uses: $1-$7 as scratch; $16-$18 are consumed.
 *
 * Paths:
 *   - src and dest equal mod 8: byte copy up to an 8-byte boundary, then
 *     (for large copies) quad copy up to a 64-byte boundary, a 64-byte
 *     unrolled loop using wh64, a quad loop, and a trailing byte loop.
 *   - otherwise ($misaligned): byte copy until dest is 8-byte aligned,
 *     then ldq_u/extql/extqh merging with aligned stores, then bytes.
 */
ENTRY(memcpy)
/* No gp setup is done here; mark the (empty) prologue like the other
   routines in this directory do.  */
.prologue 0
mov $16, $0 # E : copy dest to return
ble $18, $nomoredata # U : done with the copy?
xor $16, $17, $1 # E : are source and dest alignments the same?
and $1, 7, $1 # E : are they the same mod 8?
bne $1, $misaligned # U : Nope - gotta do this the slow way
/* source and dest are same mod 8 address */
and $16, 7, $1 # E : Are both 0mod8?
beq $1, $both_0mod8 # U : Yes
nop # E :
/*
* source and dest are same misalignment. move a byte at a time
* until a 0mod8 alignment for both is reached.
* At least one byte more to move
*/
$head_align:
ldbu $1, 0($17) # L : grab a byte
subq $18, 1, $18 # E : count--
addq $17, 1, $17 # E : src++
stb $1, 0($16) # L :
addq $16, 1, $16 # E : dest++
and $16, 7, $1 # E : Are we at 0mod8 yet?
ble $18, $nomoredata # U : done with the copy?
bne $1, $head_align # U :
$both_0mod8:
cmple $18, 127, $1 # E : Can we unroll the loop?
bne $1, $no_unroll # U :
and $16, 63, $1 # E : get mod64 alignment
beq $1, $do_unroll # U : no single quads to fiddle
/*
 * Copy single quads until dest is 0mod64.  This can consume up to 56
 * bytes, so the count reaching $do_unroll may be as low as 72.
 */
$single_head_quad:
ldq $1, 0($17) # L : get 8 bytes
subq $18, 8, $18 # E : count -= 8
addq $17, 8, $17 # E : src += 8
nop # E :
stq $1, 0($16) # L : store
addq $16, 8, $16 # E : dest += 8
and $16, 63, $1 # E : get mod64 alignment
bne $1, $single_head_quad # U : still not fully aligned
/*
 * The first wh64 issued by the loop targets dest+64 .. dest+127, i.e. the
 * block written by the NEXT trip.  wh64 tells the memory system that the
 * whole 64-byte block will be overwritten, so its old contents need not
 * be preserved.  The loop may therefore only be entered when at least
 * 128 bytes remain; with 64..127 bytes left the hint would cover bytes
 * beyond the end of the destination that this copy never rewrites.
 * Later trips are protected by the cmovlt fallback inside the loop.
 */
$do_unroll:
addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
cmple $18, 127, $1 # E : Can we go through the unrolled loop?
bne $1, $tail_quads # U : Nope
nop # E :
/*
 * One trip copies 64 bytes as two groups of four quads.
 * $7 = address for the wh64 at the top of the trip.
 * $1 = dest + 64 at the start of the trip (the next trip's block); it
 *      replaces $7 when fewer than 192 bytes remain, so that the hint is
 *      never issued for a block that will not be completely written.
 */
$unroll_body:
wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
# ($7) are about to be over-written
ldq $6, 0($17) # L0 : bytes 0..7
nop # E :
nop # E :
ldq $4, 8($17) # L : bytes 8..15
ldq $5, 16($17) # L : bytes 16..23
addq $7, 64, $7 # E : Update next wh64 address
nop # E :
ldq $3, 24($17) # L : bytes 24..31
addq $16, 64, $1 # E : fallback value for wh64
nop # E :
nop # E :
addq $17, 32, $17 # E : src += 32 bytes
stq $6, 0($16) # L : bytes 0..7
nop # E :
nop # E :
stq $4, 8($16) # L : bytes 8..15
stq $5, 16($16) # L : bytes 16..23
subq $18, 192, $2 # E : At least two more trips to go?
nop # E :
stq $3, 24($16) # L : bytes 24..31
addq $16, 32, $16 # E : dest += 32 bytes
nop # E :
nop # E :
ldq $6, 0($17) # L : bytes 0..7
ldq $4, 8($17) # L : bytes 8..15
cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
# fallback wh64 address if < 2 more trips
nop # E :
ldq $5, 16($17) # L : bytes 16..23
ldq $3, 24($17) # L : bytes 24..31
addq $16, 32, $16 # E : dest += 32
subq $18, 64, $18 # E : count -= 64
addq $17, 32, $17 # E : src += 32
stq $6, -32($16) # L : bytes 0..7
stq $4, -24($16) # L : bytes 8..15
cmple $18, 63, $1 # E : At least one more trip?
stq $5, -16($16) # L : bytes 16..23
stq $3, -8($16) # L : bytes 24..31
nop # E :
beq $1, $unroll_body
/* Fewer than 64 bytes left (or the copy was too small to unroll):
   move whole quads while the biased count stays non-negative.  */
$tail_quads:
$no_unroll:
.align 4
subq $18, 8, $18 # E : At least a quad left?
blt $18, $less_than_8 # U : Nope
nop # E :
nop # E :
$move_a_quad:
ldq $1, 0($17) # L : fetch 8
subq $18, 8, $18 # E : count -= 8
addq $17, 8, $17 # E : src += 8
nop # E :
stq $1, 0($16) # L : store 8
addq $16, 8, $16 # E : dest += 8
bge $18, $move_a_quad # U :
nop # E :
$less_than_8:
.align 4
addq $18, 8, $18 # E : add back for trailing bytes
ble $18, $nomoredata # U : All-done
nop # E :
nop # E :
/* Trailing bytes */
$tail_bytes:
subq $18, 1, $18 # E : count--
ldbu $1, 0($17) # L : fetch a byte
addq $17, 1, $17 # E : src++
nop # E :
stb $1, 0($16) # L : store a byte
addq $16, 1, $16 # E : dest++
bgt $18, $tail_bytes # U : more to be done?
nop # E :
/* branching to exit takes 3 extra cycles, so replicate exit here */
ret $31, ($26), 1 # L0 :
nop # E :
nop # E :
nop # E :
/*
 * src and dest differ mod 8.  $4 is the running destination pointer on
 * this path; $16 is reused as a data register inside $mis_quad.
 */
$misaligned:
mov $0, $4 # E : dest temp
and $0, 7, $1 # E : dest alignment mod8
beq $1, $dest_0mod8 # U : life doesnt totally suck
nop
$aligndest:
ble $18, $nomoredata # U :
ldbu $1, 0($17) # L : fetch a byte
subq $18, 1, $18 # E : count--
addq $17, 1, $17 # E : src++
stb $1, 0($4) # L : store it
addq $4, 1, $4 # E : dest++
and $4, 7, $1 # E : dest 0mod8 yet?
bne $1, $aligndest # U : go until we are aligned.
/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
subq $18, 8, $18 # E : At least a quad left?
blt $18, $misalign_tail # U : Nope
ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
nop # E :
/* $3 holds the previously loaded source quad, $16 the next one; each
   trip merges the two into one aligned store.  */
$mis_quad:
ldq_u $16, 8($17) # L : Fetch next 8
extql $3, $17, $3 # U : masking
extqh $16, $17, $1 # U : masking
bis $3, $1, $1 # E : merged bytes to store
subq $18, 8, $18 # E : count -= 8
addq $17, 8, $17 # E : src += 8
stq $1, 0($4) # L : store 8 (aligned)
mov $16, $3 # E : "rotate" source data
addq $4, 8, $4 # E : dest += 8
bge $18, $mis_quad # U : More quads to move
nop
nop
$misalign_tail:
addq $18, 8, $18 # E : account for tail stuff
ble $18, $nomoredata # U :
nop
nop
$misalign_byte:
ldbu $1, 0($17) # L : fetch 1
subq $18, 1, $18 # E : count--
addq $17, 1, $17 # E : src++
nop # E :
stb $1, 0($4) # L : store
addq $4, 1, $4 # E : dest++
bgt $18, $misalign_byte # U : more to go?
nop
$nomoredata:
ret $31, ($26), 1 # L0 :
nop # E :
nop # E :
nop # E :
END(memcpy)

View File

@ -0,0 +1,224 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson (rth@tamu.edu)
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
.arch ev6
.set noat
.set noreorder
/*
 * void *memset (void *s, int c, size_t n)
 *
 * Register use, as established by the code below:
 *   $16 = destination, $17 = fill value (its low byte is replicated into
 *   all eight byte lanes), $18 = byte count (a count <= 0 returns at
 *   once); $0 = return value (the original destination).
 *   $6 = destination + count (end address); $1-$5 and $7 are scratch.
 */
ENTRY(memset)
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
/*
* Serious stalling happens. The only way to mitigate this is to
* undertake a major re-write to interleave the constant materialization
* with other parts of the fall-through code. This is important, even
* though it makes maintenance tougher.
* Do this later.
*/
and $17, 255, $1 # E : 00000000000000ch
insbl $17, 1, $2 # U : 000000000000ch00
mov $16, $0 # E : return value
ble $18, $end # U : zero length requested?
addq $18, $16, $6 # E : max address to write to
or $1, $2, $17 # E : 000000000000chch
insbl $1, 2, $3 # U : 0000000000ch0000
insbl $1, 3, $4 # U : 00000000ch000000
or $3, $4, $3 # E : 00000000chch0000
inswl $17, 4, $5 # U : 0000chch00000000
xor $16, $6, $1 # E : will complete write be within one quadword?
inswl $17, 6, $2 # U : chch000000000000
or $17, $3, $17 # E : 00000000chchchch
or $2, $5, $2 # E : chchchch00000000
bic $1, 7, $1 # E : fit within a single quadword?
and $16, 7, $3 # E : Target addr misalignment
or $17, $2, $17 # E : chchchchchchchch
beq $1, $within_quad # U :
nop # E :
beq $3, $aligned # U : target is 0mod8
/*
* Target address is misaligned, and won't fit within a quadword.
*/
/* Merge the fill pattern into the first (partial) quad, then advance
   $16 to the next quad boundary and reduce $18 by the bytes written.  */
ldq_u $4, 0($16) # L : Fetch first partial
mov $16, $5 # E : Save the address
insql $17, $16, $2 # U : Insert new bytes
subq $3, 8, $3 # E : Invert (for addressing uses)
addq $18, $3, $18 # E : $18 is new count ($3 is negative)
mskql $4, $16, $4 # U : clear relevant parts of the quad
subq $16, $3, $16 # E : $16 is new aligned destination
or $2, $4, $1 # E : Final bytes
nop
stq_u $1,0($5) # L : Store result
nop
nop
.align 4
$aligned:
/*
* We are now guaranteed to be quad aligned, with at least
* one partial quad to write.
*/
sra $18, 3, $3 # U : Number of remaining quads to write
and $18, 7, $18 # E : Number of trailing bytes to write
mov $16, $5 # E : Save dest address
beq $3, $no_quad # U : tail stuff only
/*
* It's worth the effort to unroll this and use wh64 if possible.
* At this point, entry values are:
* $16 Current destination address
* $5 A copy of $16
* $6 The max quadword address to write to
* $18 Number trailer bytes
* $3 Number quads to write
*/
and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
blt $4, $loop # U :
/*
* We know we've got at least 16 quads, minimum of one trip
* through unrolled loop. Do a quad at a time to get us 0mod64
* aligned.
*/
/* NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative, so the beq
   below looks like it can never be taken; a destination that is already
   0mod64 appears to run $alignmod64 eight times instead.  That also
   means $4 is always given a wh64 address by $alignmod64 before
   $do_wh64 uses it.  Confirm before changing this branch.  */
nop # E :
nop # E :
nop # E :
beq $1, $bigalign # U :
$alignmod64:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : For consistency later
addq $1, 8, $1 # E : Increment towards zero for alignment
addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
nop
nop
addq $5, 8, $5 # E : Inc address
blt $1, $alignmod64 # U :
$bigalign:
/*
* $3 - number quads left to go
* $5 - target address (aligned 0mod64)
* $17 - mask of stuff to store
* Scratch registers available: $7, $2, $4, $1
* We know that we'll be taking a minimum of one trip through.
* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
* Assumes the wh64 needs to be for 2 trips through the loop in the future.
* The wh64 is issued on for the starting destination address for trip +2
* through the loop, and if there are less than two trips left, the target
* address will be for the current trip.
*/
/* Each trip stores eight quads (64 bytes).  $4 is the wh64 address for
   the top of the next trip: $5 + 128 when at least 24 quads remain at
   the start of this trip, otherwise $5 + 64 (selected by the cmovlt).  */
$do_wh64:
wh64 ($4) # L1 : memory subsystem write hint
subq $3, 24, $2 # E : For determining future wh64 addresses
stq $17, 0($5) # L :
nop # E :
addq $5, 128, $4 # E : speculative target of next wh64
stq $17, 8($5) # L :
stq $17, 16($5) # L :
addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
stq $17, 24($5) # L :
stq $17, 32($5) # L :
cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
nop
stq $17, 40($5) # L :
stq $17, 48($5) # L :
subq $3, 16, $2 # E : Repeat the loop at least once more?
nop
stq $17, 56($5) # L :
addq $5, 64, $5 # E :
subq $3, 8, $3 # E :
bge $2, $do_wh64 # U :
nop
nop
nop
beq $3, $no_quad # U : Might have finished already
.align 4
/*
* Simple loop for trailing quadwords, or for small amounts
* of data (where we can't use an unrolled loop and wh64)
*/
$loop:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : Decrement number quads left
addq $5, 8, $5 # E : Inc address
bne $3, $loop # U : more?
$no_quad:
/*
* Write 0..7 trailing bytes.
*/
/* Read-modify-write of the final quad at $5: the byte offset taken from
   the end address in $6 decides which bytes of the old contents are
   masked off and which bytes of the pattern are inserted.  */
nop # E :
beq $18, $end # U : All done?
ldq $7, 0($5) # L :
mskqh $7, $6, $2 # U : Mask final quad
insqh $17, $6, $4 # U : New bits
or $2, $4, $1 # E : Put it all together
stq $1, 0($5) # L : And back to memory
ret $31,($26),1 # L0 :
/* The whole range lies inside one quadword: build the result from the
   old quad ($1), masking by both the start address ($16) and the end
   address ($6), and store it back once.  */
$within_quad:
ldq_u $1, 0($16) # L :
insql $17, $16, $2 # U : New bits
mskql $1, $16, $4 # U : Clear old
or $2, $4, $2 # E : New result
mskql $2, $6, $4 # U :
mskqh $1, $6, $2 # U :
or $2, $4, $1 # E :
stq_u $1, 0($16) # L :
$end:
nop
nop
nop
ret $31,($26),1 # L0 :
END(memset)

View File

@ -0,0 +1,329 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson (rth@tamu.edu)
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Copy a null-terminated string from SRC to DST.
This is an internal routine used by strcpy, stpcpy, and strcat.
As such, it uses special linkage conventions to make implementation
of these public functions more efficient.
On input:
t9 = return address
a0 = DST
a1 = SRC
On output:
t8 = bitmask (with one bit set) indicating the last byte written
a0 = unaligned address of the last *word* written
Furthermore, v0, a3-a5, t11, and t12 are untouched.
*/
#include <sysdep.h>
.arch ev6
.set noat
.set noreorder
.text
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
doesn't like putting the entry point for a procedure somewhere in the
middle of the procedure descriptor. Work around this by putting the
aligned copy in its own procedure descriptor */
/* stxcpy_aligned: copy loop used when source and destination have the
   same alignment mod 8.  Reached only by branches from __stxcpy; returns
   through t9.  a0/a1 are the destination/source pointers; a1 has
   already been advanced past the first source word, which is in t1.  */
.ent stxcpy_aligned
.align 4
stxcpy_aligned:
.frame sp, 0, t9
.prologue 0
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # E : build a mask against false zero
mskqh t2, a1, t2 # U : detection in the src word (stall)
mskqh t1, a1, t3 # U :
ornot t1, t2, t2 # E : (stall)
mskql t0, a1, t0 # U : assemble the first output word
cmpbge zero, t2, t8 # E : bits set iff null found
or t0, t3, t1 # E : (stall)
bne t8, $a_eos # U : (stall)
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == a source word not containing a null. */
/* Nops here to separate store quads from load quads */
$a_loop:
stq_u t1, 0(a0) # L :
addq a0, 8, a0 # E :
nop
nop
ldq_u t1, 0(a1) # L : Latency=3
addq a1, 8, a1 # E :
cmpbge zero, t1, t8 # E : (3 cycle stall)
beq t8, $a_loop # U : (stall for t8)
/* Take care of the final (partial) word store.
On entry to this basic block we have:
t1 == the source word containing the null
t8 == the cmpbge mask that found it. */
/* NOTE(review): the isolated lowest set bit (the position of the null)
   is kept in t10.  On the partial-store path t8 is overwritten with
   t10 | (t10 - 1), i.e. one bit per byte up to and including the null.
   The file header documents t8 as the one-bit output mask; confirm
   which register the callers actually read.  */
$a_eos:
negq t8, t6 # E : find low bit set
and t8, t6, t10 # E : (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t10, 0x80, t6 # E : (stall)
bne t6, 1f # U : (stall)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t0, 0(a0) # L : Latency=3
subq t10, 1, t6 # E :
zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
or t10, t6, t8 # E : (stall)
zap t0, t8, t0 # E : clear dst bytes <= null
or t0, t1, t1 # E : (stall)
nop
nop
1: stq_u t1, 0(a0) # L :
ret (t9) # L0 : Latency=3
nop
nop
.end stxcpy_aligned
/* __stxcpy: entry point.  a0 = DST, a1 = SRC, t9 = return address.
   Co-aligned pointers are handed to stxcpy_aligned; otherwise control
   goes to $unaligned below, which either finishes immediately (null in
   the first partial source word) or continues at $u_head.
   NOTE(review): the file header documents t8 as the one-bit output
   mask, but $u_final keeps the isolated bit in t10 and overwrites t8
   with t10 | (t10 - 1), and the early exit at the end of $unaligned
   adjusts t10 as the "final null return value".  Confirm against the
   callers which register carries the result.  */
.align 4
.ent __stxcpy
.globl __stxcpy
__stxcpy:
.frame sp, 0, t9
.prologue 0
/* Are source and destination co-aligned? */
xor a0, a1, t0 # E :
unop # E :
and t0, 7, t0 # E : (stall)
bne t0, $unaligned # U : (stall)
/* We are co-aligned; take care of a partial first word. */
ldq_u t1, 0(a1) # L : load first src word
and a0, 7, t0 # E : take care not to load a word ...
addq a1, 8, a1 # E :
beq t0, stxcpy_aligned # U : ... if we wont need it (stall)
ldq_u t0, 0(a0) # L :
br stxcpy_aligned # L0 : Latency=3
nop
nop
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 4
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, for masking back in, if needed else 0
t1 == the low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
ldq_u t2, 8(a1) # L :
addq a1, 8, a1 # E :
extql t1, a1, t1 # U : (stall on a1)
extqh t2, a1, t4 # U : (stall on a1)
mskql t0, a0, t0 # U :
or t1, t4, t1 # E :
mskqh t1, a0, t1 # U : (stall on t1)
or t0, t1, t1 # E : (stall on t1)
or t1, t6, t6 # E :
cmpbge zero, t6, t8 # E : (stall)
lda t6, -1 # E : for masking just below
bne t8, $u_final # U : (stall)
mskql t6, a1, t6 # U : mask out the bits we have
or t6, t2, t2 # E : already extracted before (stall)
cmpbge zero, t2, t8 # E : testing eos (stall)
bne t8, $u_late_head_exit # U : (stall)
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
stq_u t1, 0(a0) # L : store first output word
addq a0, 8, a0 # E :
extql t2, a1, t0 # U : position ho-bits of lo word
ldq_u t2, 8(a1) # U : read next high-order source word
addq a1, 8, a1 # E :
cmpbge zero, t2, t8 # E : (stall for t2)
nop # E :
bne t8, $u_eos # U : (stall)
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t0 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 3
$u_loop:
extqh t2, a1, t1 # U : extract high bits for current word
addq a1, 8, a1 # E : (stall)
extql t2, a1, t3 # U : extract low bits for next time (stall)
addq a0, 8, a0 # E :
or t0, t1, t1 # E : current dst word now complete
ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
stq_u t1, -8(a0) # L : save the current word (stall)
mov t3, t0 # E :
cmpbge zero, t2, t8 # E : test new word for eos
beq t8, $u_loop # U : (stall)
nop
nop
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t0 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
extqh t2, a1, t1 # U :
or t0, t1, t1 # E : first (partial) source word complete (stall)
cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
bne t8, $u_final # U : (stall)
$u_late_head_exit:
stq_u t1, 0(a0) # L : the null was in the high-order bits
addq a0, 8, a0 # E :
extql t2, a1, t1 # U :
cmpbge zero, t1, t8 # E : (stall)
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t1 == assembled source word
t8 == cmpbge mask that found the null. */
$u_final:
negq t8, t6 # E : isolate low bit set
and t6, t8, t10 # E : (stall)
and t10, 0x80, t6 # E : avoid dest word load if we can (stall)
bne t6, 1f # U : (stall)
ldq_u t0, 0(a0) # E :
subq t10, 1, t6 # E :
or t6, t10, t8 # E : (stall)
zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
or t0, t1, t1 # E : (stall)
nop
nop
1: stq_u t1, 0(a0) # L :
ret (t9) # L0 : Latency=3
nop
nop
/* Unaligned copy entry point. */
.align 4
$unaligned:
ldq_u t1, 0(a1) # L : load first source word
and a0, 7, t4 # E : find dest misalignment
and a1, 7, t5 # E : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # E :
mov zero, t6 # E :
beq t4, 1f # U :
ldq_u t0, 0(a0) # L :
lda t6, -1 # E :
mskql t6, a0, t6 # U :
nop
nop
nop
1:
subq a1, t4, a1 # E : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
cmplt t4, t5, t10 # E :
beq t10, $u_head # U :
lda t2, -1 # E : mask out leading garbage in source
mskqh t2, t5, t2 # U :
ornot t1, t2, t3 # E : (stall)
cmpbge zero, t3, t8 # E : is there a zero? (stall)
beq t8, $u_head # U : (stall)
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # L :
negq t8, t6 # E : build bitmask of bytes <= zero
and t6, t8, t10 # E : (stall)
and a1, 7, t5 # E :
subq t10, 1, t6 # E :
or t6, t10, t8 # E : (stall)
srl t10, t5, t10 # U : adjust final null return value
zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
and t1, t2, t1 # E : to source validity mask
extql t2, a1, t2 # U :
extql t1, a1, t1 # U : (stall)
andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
or t0, t1, t1 # e1 : and put it there
stq_u t1, 0(a0) # .. e0 : (stall)
ret (t9) # e1 :
nop
.end __stxcpy

View File

@ -0,0 +1,405 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson (rth@tamu.edu)
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Copy no more than COUNT bytes of the null-terminated string from
SRC to DST.
This is an internal routine used by strncpy, stpncpy, and strncat.
As such, it uses special linkage conventions to make implementation
of these public functions more efficient.
On input:
t9 = return address
a0 = DST
a1 = SRC
a2 = COUNT
Furthermore, COUNT may not be zero.
On output:
t0 = last word written
t8 = bitmask (with one bit set) indicating the last byte written
t10 = bitmask (with one bit set) indicating the byte position of
the end of the range specified by COUNT
a0 = unaligned address of the last *word* written
a2 = the number of full words left in COUNT
Furthermore, v0, a3-a5, t11, and t12 are untouched.
*/
#include <sysdep.h>
.arch ev6
.set noat
.set noreorder
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
doesn't like putting the entry point for a procedure somewhere in the
middle of the procedure descriptor. Work around this by putting the
aligned copy in its own procedure descriptor */
/* stxncpy_aligned: copy loop used when source and destination have the
   same alignment mod 8.  Reached only by branches from __stxncpy;
   returns through t9.  Besides the words in t0/t1 it reads a2 (the
   remaining word count, tested against zero) and t10 (the end-of-count
   byte mask that $a_eoc merges into the null-detection mask).  */
.ent stxncpy_aligned
.align 4
stxncpy_aligned:
.frame sp, 0, t9, 0
.prologue 0
/* On entry to this basic block:
t0 == the first destination word for masking back in
t1 == the first source word. */
/* Create the 1st output word and detect 0's in the 1st input word. */
lda t2, -1 # E : build a mask against false zero
mskqh t2, a1, t2 # U : detection in the src word (stall)
mskqh t1, a1, t3 # U :
ornot t1, t2, t2 # E : (stall)
mskql t0, a1, t0 # U : assemble the first output word
cmpbge zero, t2, t7 # E : bits set iff null found
or t0, t3, t0 # E : (stall)
beq a2, $a_eoc # U :
bne t7, $a_eos # U :
nop
nop
nop
/* On entry to this basic block:
t0 == a source word not containing a null. */
/*
* nops here to:
* separate store quads from load quads
* limit of 1 bcond/quad to permit training
*/
$a_loop:
stq_u t0, 0(a0) # L :
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
nop
ldq_u t0, 0(a1) # L :
addq a1, 8, a1 # E :
cmpbge zero, t0, t7 # E :
beq a2, $a_eoc # U :
beq t7, $a_loop # U :
nop
nop
nop
/* Take care of the final (partial) word store. At this point
the end-of-count bit is set in t7 iff it applies.
On entry to this basic block we have:
t0 == the source word containing the null
t7 == the cmpbge mask that found it. */
/* Here t8 ends up as the isolated lowest set bit of t7, and t7 is then
   reused for the mask of all bytes up to and including that bit.  */
$a_eos:
negq t7, t8 # E : find low bit set
and t7, t8, t8 # E : (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
and t8, 0x80, t6 # E : (stall)
bne t6, 1f # U : (stall)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t1, 0(a0) # L :
subq t8, 1, t6 # E :
or t8, t6, t7 # E : (stall)
zapnot t0, t7, t0 # U : clear src bytes > null (stall)
zap t1, t7, t1 # .. e1 : clear dst bytes <= null
or t0, t1, t0 # e1 : (stall)
nop
nop
1: stq_u t0, 0(a0) # L :
ret (t9) # L0 : Latency=3
nop
nop
/* Add the end-of-count bit to the eos detection bitmask. */
$a_eoc:
or t10, t7, t7 # E :
br $a_eos # L0 : Latency=3
nop
nop
.end stxncpy_aligned
/* __stxncpy: entry point.  a0 = DST, a1 = SRC, a2 = COUNT, t9 = return
   address.  The entry sequence biases COUNT by the destination
   misalignment and then derives:
     a2  = (COUNT + (DST & 7) - 1) >> 3   (loop counter, in words)
     t10 = 1 << ((COUNT + (DST & 7) - 1) & 7)   (end-of-count byte mask)
   Co-aligned pointers are handed to stxncpy_aligned; otherwise control
   goes to $unaligned below, which either finishes immediately or
   continues at $u_head.  */
.align 4
.ent __stxncpy
.globl __stxncpy
__stxncpy:
.frame sp, 0, t9, 0
.prologue 0
/* Are source and destination co-aligned? */
xor a0, a1, t1 # E :
and a0, 7, t0 # E : find dest misalignment
and t1, 7, t1 # E : (stall)
addq a2, t0, a2 # E : bias count by dest misalignment (stall)
subq a2, 1, a2 # E :
and a2, 7, t2 # E : (stall)
srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall)
addq zero, 1, t10 # E :
sll t10, t2, t10 # U : t10 = bitmask of last count byte
bne t1, $unaligned # U :
/* We are co-aligned; take care of a partial first word. */
ldq_u t1, 0(a1) # L : load first src word
addq a1, 8, a1 # E :
beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
ldq_u t0, 0(a0) # L :
nop
nop
br stxncpy_aligned # .. e1 :
nop
nop
nop
/* The source and destination are not co-aligned. Align the destination
and cope. We have to be very careful about not reading too much and
causing a SEGV. */
.align 4
$u_head:
/* We know just enough now to be able to assemble the first
full source word. We can still find a zero at the end of it
that prevents us from outputting the whole thing.
On entry to this basic block:
t0 == the first dest word, unmasked
t1 == the shifted low bits of the first source word
t6 == bytemask that is -1 in dest word bytes */
ldq_u t2, 8(a1) # L : Latency=3 load second src word
addq a1, 8, a1 # E :
mskql t0, a0, t0 # U : mask trailing garbage in dst
extqh t2, a1, t4 # U : (3 cycle stall on t2)
or t1, t4, t1 # E : first aligned src word complete (stall)
mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
or t0, t1, t0 # E : first output word complete (stall)
or t0, t6, t6 # E : mask original data for zero test (stall)
cmpbge zero, t6, t7 # E :
beq a2, $u_eocfin # U :
nop
nop
bne t7, $u_final # U :
lda t6, -1 # E : mask out the bits we have
mskql t6, a1, t6 # U : already seen (stall)
stq_u t0, 0(a0) # L : store first output word
or t6, t2, t2 # E :
cmpbge zero, t2, t7 # E : find nulls in second partial (stall)
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
bne t7, $u_late_head_exit # U :
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
extql t2, a1, t1 # U : position hi-bits of lo word
ldq_u t2, 8(a1) # L : read next high-order source word
addq a1, 8, a1 # E :
cmpbge zero, t2, t7 # E : (stall)
beq a2, $u_eoc # U :
nop
nop
bne t7, $u_eos # e1 :
nop
nop
nop
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
This has, unfortunately, effectively pulled half of a loop
iteration out into the head and half into the tail, but it does
prevent nastiness from accumulating in the very thing we want
to run as fast as possible.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word
We further know that t2 does not contain a null terminator. */
.align 4
$u_loop:
extqh t2, a1, t0 # U : extract high bits for current word
addq a1, 8, a1 # E :
extql t2, a1, t3 # U : extract low bits for next time
addq a0, 8, a0 # E :
or t0, t1, t0 # E : current dst word now complete
ldq_u t2, 0(a1) # U : Latency=3 load high word for next time
stq_u t0, -8(a0) # U : save the current word (stall)
mov t3, t1 # E :
subq a2, 1, a2 # E :
cmpbge zero, t2, t7 # E : test new word for eos (2 cycle stall for data)
beq a2, $u_eoc # U : (stall)
nop
beq t7, $u_loop # U :
nop
nop
nop
/* We've found a zero somewhere in the source word we just read.
If it resides in the lower half, we have one (probably partial)
word to write out, and if it resides in the upper half, we
have one full and one partial word left to write out.
On entry to this basic block:
t1 == the shifted high-order bits from the previous source word
t2 == the unshifted current source word. */
$u_eos:
extqh t2, a1, t0 # U :
or t0, t1, t0 # E : first (partial) source word complete (stall)
cmpbge zero, t0, t7 # E : is the null in this first bit? (stall)
bne t7, $u_final # U : (stall)
stq_u t0, 0(a0) # L : the null was in the high-order bits
addq a0, 8, a0 # E :
subq a2, 1, a2 # E :
nop
/* The remaining source bytes are in the high part of t2.  When the word
   count has reached zero (a2 == 0) the end-of-count bit in t10 is merged
   into the null mask before the final store.  */
$u_late_head_exit:
extql t2, a1, t0 # U :
cmpbge zero, t0, t7 # E :
or t7, t10, t6 # E : (stall)
cmoveq a2, t6, t7 # E : Latency=2, extra map slot (stall)
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t0 == assembled source word
t7 == cmpbge mask that found the null. */
$u_final:
negq t7, t6 # E : isolate low bit set
and t6, t7, t8 # E : (stall)
and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
bne t6, 1f # U : (stall)
ldq_u t1, 0(a0) # L :
subq t8, 1, t6 # E :
or t6, t8, t7 # E : (stall)
zapnot t0, t7, t0 # U : kill source bytes > null
zap t1, t7, t1 # U : kill dest bytes <= null
or t0, t1, t0 # E : (stall)
nop
nop
1: stq_u t0, 0(a0) # L :
ret (t9) # L0 : Latency=3
/* The word count ran out: assemble the last output word from t1 and t2,
   then fall into $u_eocfin, which ORs the end-of-count bit (t10) into
   the null mask and finishes through $u_final.  */
$u_eoc: # end-of-count
extqh t2, a1, t0 # U :
or t0, t1, t0 # E : (stall)
cmpbge zero, t0, t7 # E : (stall)
nop
$u_eocfin: # end-of-count, final word
or t10, t7, t7 # E :
br $u_final # L0 : Latency=3
nop
nop
/* Unaligned copy entry point. */
.align 4
$unaligned:
ldq_u t1, 0(a1) # L : load first source word
and a0, 7, t4 # E : find dest misalignment
and a1, 7, t5 # E : find src misalignment
/* Conditionally load the first destination word and a bytemask
with 0xff indicating that the destination byte is sacrosanct. */
mov zero, t0 # E :
mov zero, t6 # E :
beq t4, 1f # U :
ldq_u t0, 0(a0) # L :
lda t6, -1 # E :
mskql t6, a0, t6 # U :
nop
nop
nop
1:
subq a1, t4, a1 # E : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
cmplt t4, t5, t8 # E :
extql t1, a1, t1 # U : shift src into place
lda t2, -1 # E : for creating masks later
beq t8, $u_head # U : (stall)
mskqh t2, t5, t2 # U : begin src byte validity mask
cmpbge zero, t1, t7 # E : is there a zero?
extql t2, a1, t2 # U :
or t7, t10, t5 # E : test for end-of-count too
cmpbge zero, t2, t3 # E :
cmoveq a2, t5, t7 # E : Latency=2, extra map slot
nop # E : keep with cmoveq
andnot t7, t3, t7 # E : (stall)
beq t7, $u_head # U :
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # L :
negq t7, t6 # E : build bitmask of bytes <= zero
mskqh t1, t4, t1 # U :
and t6, t7, t8 # E :
subq t8, 1, t6 # E : (stall)
or t6, t8, t7 # E : (stall)
zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
zapnot t1, t7, t1 # U : to source validity mask
andnot t0, t2, t0 # E : zero place for source to reside
or t0, t1, t0 # E : and put it there (stall both t0, t1)
stq_u t0, 0(a0) # L : (stall)
ret (t9) # L0 : Latency=3
nop
nop
nop
.end __stxncpy

View File

@ -0,0 +1 @@
alpha/alphaev6

View File

@ -0,0 +1,50 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Finds the first bit set in an integer. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
/* __ffs: a0 holds the integer argument.  Returns in v0 the 1-based index of
   the least significant set bit among the low 32 bits of a0, or 0 when none
   of those bits is set.  Only a0 and v0 are written (plus the profiling
   registers when PROF is defined). */
ENTRY(__ffs)
#ifndef PROF
.prologue 0
#else
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#endif
zapnot a0, 0x0F, a0 /* keep bytes 0-3 only: discard the upper half of the register */
cttz a0, v0 /* zero-based position of the lowest set bit */
addq v0, 1, v0 /* ffs numbers bits starting from 1 */
cmoveq a0, 0, v0 /* no bit set at all: the answer is 0 */
nop
nop
nop
ret
END(__ffs)
weak_alias (__ffs, ffs)

View File

@ -0,0 +1,45 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Finds the first bit set in a long. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
/* ffsl: a0 holds the 64-bit argument.  Returns in v0 the 1-based index of
   the least significant set bit of a0, or 0 when a0 is zero.  The same code
   is exported under the name ffsll. */
ENTRY(ffsl)
#ifndef PROF
.prologue 0
#else
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#endif
cttz a0, v0 /* zero-based position of the lowest set bit */
addq v0, 1, v0 /* convert to the 1-based ffs numbering */
cmoveq a0, 0, v0 /* argument was zero: return 0 */
ret
END(ffsl)
weak_extern (ffsl)
weak_alias (ffsl, ffsll)

View File

@ -0,0 +1,93 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return pointer to first occurrence of CH in STR. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
/* __rawmemchr: a0 = start address, a1 = search character (only its low byte
   is used).  Returns in v0 the address of the first byte equal to that
   character.  There is no length argument and no terminator test: the only
   way out of the scan is a match.
   Method: replicate the character into all eight bytes of a1, XOR each
   quadword of memory with it so that matching bytes become zero, and let
   cmpbge report the zero bytes as a bit mask.
   Registers written: v0, t0, t1, t3, t4, t5, a1, a2, a3. */
ENTRY(__rawmemchr)
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
/* First (possibly unaligned) quadword: ldq_u fetches the aligned quadword
   containing a0; the bytes below a0 are "garbage" and are masked out of the
   match mask further down. */
ldq_u t0, 0(a0) # L : load first quadword Latency=3
and a1, 0xff, t3 # E : 00000000000000ch
insbl a1, 1, t5 # U : 000000000000ch00
insbl a1, 7, a2 # U : ch00000000000000
insbl t3, 6, a3 # U : 00ch000000000000
or t5, t3, a1 # E : 000000000000chch
andnot a0, 7, v0 # E : align our loop pointer
lda t4, -1 # E : build garbage mask
mskqh t4, a0, t4 # U : only want relevant part of first quad
or a2, a3, a2 # E : chch000000000000
inswl a1, 2, t5 # E : 00000000chch0000
inswl a1, 4, a3 # E : 0000chch00000000
or a1, a2, a1 # E : chch00000000chch
or a3, t5, t5 # E : 0000chchchch0000
cmpbge zero, t4, t4 # E : bits set iff byte is garbage
nop # E :
/* This quad is _very_ serialized. Lots of stalling happens */
or t5, a1, a1 # E : chchchchchchchch
xor t0, a1, t1 # E : make bytes == c zero
cmpbge zero, t1, t0 # E : bits set iff byte == c
andnot t0, t4, t0 # E : clear garbage bits
cttz t0, a2 # U0 : speculative (in case we get a match)
nop # E :
nop # E :
bne t0, $found # U :
/*
* Yuk. This loop is going to stall like crazy waiting for the
* data to be loaded. Not much can be done about it unless it's
* unrolled multiple times, which is generally unsafe.
*/
/* Aligned scan: v0 advances by 8 per iteration; t0 becomes the per-byte
   match mask and a2 the index of its lowest set bit. */
$loop:
ldq t0, 8(v0) # L : Latency=3
addq v0, 8, v0 # E :
xor t0, a1, t1 # E :
cmpbge zero, t1, t0 # E : bits set iff byte == c
cttz t0, a2 # U0 : speculative (in case we get a match)
nop # E :
nop # E :
beq t0, $loop # U :
/* Here v0 is the aligned address of the quadword holding the match and a2
   is the byte index of the first match inside it.  The isolated bit left in
   t0 by negq/and is not read again before the ret. */
$found:
negq t0, t1 # E : clear all but least set bit
and t0, t1, t0 # E :
addq v0, a2, v0 # E : Add in the bit number from above
ret # L0 :
END(__rawmemchr)
weak_alias (__rawmemchr, rawmemchr)

View File

@ -0,0 +1,52 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Richard Henderson <rth@redhat.com>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Copy SRC to DEST returning the address of the terminating 0 in DEST. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
.text
/* __stpcpy: a0 = DEST, a1 = SRC (a1 is not touched here before the call).
   The copy itself is delegated to __stxcpy, called with its return address
   in t9; this wrapper only turns the helper's outputs into the address of
   the terminating 0 in DEST, returned in v0. */
ENTRY(__stpcpy)
ldgp gp, 0(pv)
#ifdef PROF
lda AT, _mcount
jsr AT, (AT), _mcount
#endif
.prologue 1
.align 4
mov a0, v0
nop
jsr t9, __stxcpy
# t8 = bitmask (with one bit set) indicating the last byte written
# a0 = unaligned address of the last *word* written
/* Convert the one-bit mask into a byte index, round a0 down to the start of
   its quadword, and add the two to get the address of the last byte written. */
cttz t8, t8
andnot a0, 7, a0
addq a0, t8, v0
ret
END(__stpcpy)
weak_alias (__stpcpy, stpcpy)

View File

@ -0,0 +1,116 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson (rth@redhat.com)
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Copy no more than N bytes from SRC to DEST, returning the address of
the terminating '\0' in DEST. */
#include <sysdep.h>
.arch ev6
.set noat
.set noreorder
.text
/* __stpncpy: a0 = DEST, a1 = SRC, a2 = N.  Result in v0.
   v0 is preset to DEST, so N == 0 returns DEST without touching memory.
   Otherwise __stxncpy performs the copy (return address in t9) and the code
   below zero-fills the remainder of the N-byte range and computes v0.
   NOTE(review): the meaning of t0 / t8 / t10 / a0 / a2 after the call is
   read off from how they are used here (t0 = last word written, t8 = one-bit
   mask of the last byte written, t10 = one-bit mask of the end-of-count byte,
   a0 = address of the last word written, a2 = full words left) -- confirm
   against the __stxncpy header. */
ENTRY(__stpncpy)
ldgp gp, 0(pv)
#ifdef PROF
lda AT, _mcount
jsr AT, (AT), _mcount
#endif
.prologue 1
mov a0, v0
beq a2, $zerocount
.align 4
nop
nop
jsr t9, __stxncpy # do the work of the copy
/* t4 = byte index of the t8 bit; t5 = the byte of t0 selected by t8, i.e.
   non-zero iff the last byte written was not a NUL; a0 rounded down to its
   quadword. */
cttz t8, t4
zapnot t0, t8, t5
andnot a0, 7, a0
bne a2, $multiword # do we have full words left?
/* Single-word case.  t2 = all mask bits up to and including t8, t3 = all
   mask bits up to and including t10; t3 & ~t2 therefore selects the bytes
   after the last one written through the end-of-count byte, and zap clears
   exactly those in t0.  v0 = a0 + t4, plus 1 when the last byte written was
   not a NUL (t5 is reduced to 0/1 by cmpult). */
subq t8, 1, t2
subq t10, 1, t3
cmpult zero, t5, t5
addq a0, t4, v0
or t2, t8, t2
or t3, t10, t3
addq v0, t5, v0
andnot t3, t2, t3
zap t0, t3, t0
nop
stq t0, 0(a0)
ret
/* Multi-word case: keep the bytes of t0 up to and including the last one
   written and zero the rest of that word, then zero whole quadwords, then
   fix up the final partial word. */
$multiword:
subq t8, 1, t7 # clear the final bits in the prev word
cmpult zero, t5, t5
or t7, t8, t7
zapnot t0, t7, t0
subq a2, 1, a2
stq t0, 0(a0)
addq a0, 8, a1
beq a2, 1f # loop over full words remaining
nop
nop
nop
/* If the remaining word count is odd, store one zero quadword first so the
   loop at 0: can store two per iteration. */
blbc a2, 0f
stq zero, 0(a1)
subq a2, 1, a2
addq a1, 8, a1
beq a2, 1f
/* Two zero quadwords per iteration until a2 reaches zero. */
0: stq zero, 0(a1)
subq a2, 2, a2
nop
nop
stq zero, 8(a1)
addq a1, 16, a1
nop
bne a2, 0b
/* Final word: t7 = all mask bits up to and including t10, so zap clears the
   bytes that are still inside the count and leaves the bytes beyond it as
   they were in memory.  v0 is computed as in the single-word case. */
1: ldq t0, 0(a1) # clear the leading bits in the final word
subq t10, 1, t7
addq a0, t4, v0
nop
or t7, t10, t7
addq v0, t5, v0
zap t0, t7, t0
stq t0, 0(a1)
/* Shared exit; when reached directly for N == 0, v0 still holds DEST. */
$zerocount:
nop
nop
nop
ret
END(__stpncpy)
weak_alias (__stpncpy, stpncpy)

View File

@ -0,0 +1,62 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@tamu.edu>, 1996.
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Append a null-terminated string from SRC to DST. */
#include <sysdep.h>
.arch ev6
.set noreorder
.text
/* strcat: $16 = DST, $17 = SRC ($17 is never written here).  $0 is set to
   the original DST before anything else.  The routine finds DST's
   terminating NUL a quadword at a time, leaves $16 pointing at it, and then
   jumps to __stxcpy to perform the copy from there. */
ENTRY(strcat)
ldgp gp, 0(pv)
#ifdef PROF
.set noat
lda AT, _mcount
jsr AT, (AT), _mcount
.set at
#endif
.prologue 1
mov $16, $0 # E : set up return value
/* Find the end of the string. */
ldq_u $1, 0($16) # L : load first quadword (a0 may be misaligned)
lda $2, -1 # E :
/* insqh turns the all-ones value into 0xff in the bytes that lie below DST
   within its first quadword; OR-ing that into the data guarantees those
   bytes cannot be mistaken for a NUL. */
insqh $2, $16, $2 # U :
andnot $16, 7, $16 # E :
or $2, $1, $1 # E :
cmpbge $31, $1, $2 # E : bits set iff byte == 0
bne $2, $found # U :
/* Aligned scan, one quadword per iteration, until some byte is zero. */
$loop: ldq $1, 8($16) # L :
addq $16, 8, $16 # E :
cmpbge $31, $1, $2 # E :
beq $2, $loop # U :
/* $3 = index of the first zero byte; $16 = address of that byte. */
$found: cttz $2, $3 # U0 :
addq $16, $3, $16 # E :
/* Now do the append. */
/* Tail call: our return address ($26) is copied into $23 and control is
   transferred with jmp, so no frame is needed here.
   NOTE(review): relies on __stxcpy returning through $23 -- confirm against
   its definition. */
mov $26, $23 # E :
jmp $31, __stxcpy # L0 :
END(strcat)

View File

@ -0,0 +1,101 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@tamu.edu>, 1996.
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return the address of a given character within a null-terminated
string, or null if it is not found. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
/* strchr: a0 = string, a1 = character (only its low byte is used).
   Returns in v0 the address of the first byte equal to the character, or 0
   if a NUL byte is reached first.
   Method: replicate the character into all eight bytes of a1; for every
   quadword build two cmpbge masks, t2 (byte == 0) and t3 (byte == c), and
   stop at the first quadword where either is non-zero.
   Registers written: v0, t0-t5, a1, a2, a3. */
ENTRY(strchr)
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
/* First (possibly unaligned) quadword: bytes below a0 are garbage and are
   removed from the combined mask via t4. */
ldq_u t0, 0(a0) # L : load first quadword Latency=3
and a1, 0xff, t3 # E : 00000000000000ch
insbl a1, 1, t5 # U : 000000000000ch00
insbl a1, 7, a2 # U : ch00000000000000
insbl t3, 6, a3 # U : 00ch000000000000
or t5, t3, a1 # E : 000000000000chch
andnot a0, 7, v0 # E : align our loop pointer
lda t4, -1 # E : build garbage mask
mskqh t4, a0, t4 # U : only want relevant part of first quad
or a2, a3, a2 # E : chch000000000000
inswl a1, 2, t5 # E : 00000000chch0000
inswl a1, 4, a3 # E : 0000chch00000000
or a1, a2, a1 # E : chch00000000chch
or a3, t5, t5 # E : 0000chchchch0000
cmpbge zero, t0, t2 # E : bits set iff byte == zero
cmpbge zero, t4, t4 # E : bits set iff byte is garbage
/* This quad is _very_ serialized. Lots of stalling happens */
or t5, a1, a1 # E : chchchchchchchch
xor t0, a1, t1 # E : make bytes == c zero
cmpbge zero, t1, t3 # E : bits set iff byte == c
or t2, t3, t0 # E : bits set iff char match or zero match
andnot t0, t4, t0 # E : clear garbage bits
cttz t0, a2 # U0 : speculative (in case we get a match)
nop # E :
bne t0, $found # U :
/*
* Yuk. This loop is going to stall like crazy waiting for the
* data to be loaded. Not much can be done about it unless it's
* unrolled multiple times, which is generally unsafe.
*/
/* Aligned scan.  Note that a2 is taken from t3 (character matches only),
   not from the combined mask t0. */
$loop:
ldq t0, 8(v0) # L : Latency=3
addq v0, 8, v0 # E :
xor t0, a1, t1 # E :
cmpbge zero, t0, t2 # E : bits set iff byte == 0
cmpbge zero, t1, t3 # E : bits set iff byte == c
or t2, t3, t0 # E :
cttz t3, a2 # U0 : speculative (in case we get a match)
beq t0, $loop # U :
/* t0 is reduced to its lowest set bit (the first NUL-or-match position).
   AND-ing it with t3 leaves a non-zero value only when that first hit is a
   character match; otherwise cmoveq replaces the computed address with 0. */
$found:
negq t0, t1 # E : clear all but least set bit
and t0, t1, t0 # E :
and t0, t3, t1 # E : bit set iff byte was the char
addq v0, a2, v0 # E : Add in the bit number from above
cmoveq t1, $31, v0 # E : Two mapping slots, latency = 2
nop
nop
ret # L0 :
END(strchr)
weak_alias (strchr, index)

View File

@ -0,0 +1,61 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by David Mosberger (davidm@cs.arizona.edu).
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Finds length of a 0-terminated string. */
#include <sysdep.h>
.arch ev6
.set noreorder
.set noat
/* strlen: $16 = string.  Returns in $0 the number of bytes before the first
   zero byte.  $16 is left unmodified so it can be subtracted at the end;
   $0 doubles as the aligned scan pointer.  Registers written: $0-$3. */
ENTRY(strlen)
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
lda $2, -1($31) # E :
/* insqh turns the all-ones value into 0xff in the bytes that lie below the
   string start within its first quadword; OR-ing that into the data keeps
   those bytes from being reported as zero. */
insqh $2, $16, $2 # U :
andnot $16, 7, $0 # E :
or $2, $1, $1 # E :
cmpbge $31, $1, $2 # E : $2 <- bitmask: bit i == 1 <==> i-th byte == 0
nop # E :
bne $2, $found # U :
/* Aligned scan, one quadword per iteration, until some byte is zero. */
$loop: ldq $1, 8($0) # L :
addq $0, 8, $0 # E : addr += 8
cmpbge $31, $1, $2 # E :
beq $2, $loop # U :
/* $3 = index of the first zero byte in the current quadword; the length is
   (quadword address + index) - original string address. */
$found:
cttz $2, $3 # U0 :
addq $0, $3, $0 # E :
subq $0, $16, $0 # E :
ret $31, ($26) # L0 :
END(strlen)

View File

@ -0,0 +1,101 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@tamu.edu>, 1996.
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Append no more than COUNT characters from the null-terminated string SRC
to the null-terminated string DST. Always null-terminate the new DST. */
#include <sysdep.h>
.arch ev6
.set noreorder
.text
/* strncat: $16 = DST, $17 = SRC ($17 is never written here), $18 = COUNT.
   Returns the original DST in $0.  COUNT == 0 returns immediately.
   Otherwise the routine finds DST's terminating NUL a quadword at a time,
   calls __stxncpy (return address in $23) to append at most COUNT bytes,
   and then makes sure the result is NUL-terminated.

   Registers consumed after __stxncpy returns:
     $1  (t0)  = last word written
     $22 (t8)  = one-bit mask selecting the last byte written
     $24 (t10) = one-bit mask selecting the end-of-count byte
     $16 (a0)  = address of the last word written
     $18 (a2)  = number of full words left in COUNT
   The "last byte written" mask is produced by __stxncpy in t8, which is $22.
   $27 (t12/pv) is not an output of __stxncpy and must not be used for it. */
ENTRY(strncat)
ldgp gp, 0(pv)
#ifdef PROF
.set noat
lda AT, _mcount
jsr AT, (AT), _mcount
.set at
#endif
.prologue 1
mov $16, $0 # set up return value
beq $18, $zerocount # U :
/* Find the end of the string. */
ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
lda $2, -1($31) # E :
/* insqh turns the all-ones value into 0xff in the bytes that lie below DST
   within its first quadword, so they cannot be mistaken for a NUL. */
insqh $2, $0, $2 # U :
andnot $16, 7, $16 # E :
nop # E :
or $2, $1, $1 # E :
nop # E :
nop # E :
cmpbge $31, $1, $2 # E : bits set iff byte == 0
bne $2, $found # U :
/* Aligned scan, one quadword per iteration, until some byte is zero. */
$loop: ldq $1, 8($16) # L :
addq $16, 8, $16 # E :
cmpbge $31, $1, $2 # E :
beq $2, $loop # U :
/* $3 = index of the first zero byte; $16 = address of that byte. */
$found: cttz $2, $3 # U0 :
addq $16, $3, $16 # E :
/* Now do the append. */
jsr $23, __stxncpy # L0 :
/* Worry about the null termination. */
/* If the last byte written is already zero there is nothing to do. */
zapnot $1, $22, $2 # U : was last byte a null?
cmplt $22, $24, $5 # E : did we fill the buffer completely?
bne $2, 0f # U :
ret # L0 :
/* The last byte written was not a NUL.  If the copy stopped before the
   end-of-count byte, or full words remain, clear the end-of-count byte. */
0: or $5, $18, $2 # E :
nop
bne $2, 2f # U :
and $24, 0x80, $3 # E : no zero next byte
nop # E :
bne $3, 1f # U :
/* Here there are bytes left in the current word. Clear one. */
addq $24, $24, $24 # E : end-of-count bit <<= 1
nop # E :
2: zap $1, $24, $1 # U :
nop # E :
stq_u $1, 0($16) # L :
ret # L0 :
1: /* Here we must clear the first byte of the next DST word */
stb $31, 8($16) # L :
nop # E :
nop # E :
ret # L0 :
$zerocount:
nop # E :
nop # E :
nop # E :
ret # L0 :
END(strncat)

View File

@ -1,4 +1,4 @@
/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc. /* Copyright (C) 1996, 1997, 1998, 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library. This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or The GNU C Library is free software; you can redistribute it and/or
@ -30,15 +30,13 @@ ENTRY(htonl)
.prologue 0 .prologue 0
#endif #endif
/* Side-by-side hunk: old sequence on the left, new shorter sequence on the right.  In the new sequence t2 must be t1 shifted RIGHT by 16 (srl) so that AA and CC land in bytes 0 and 2 for the zapnot 0x05 that follows. */
extlh a0, 5, t1 # t1 = dd000000 inslh a0, 7, t0 # t0 = 0000000000AABBCC
zap a0, 0xfd, t2 # t2 = 0000cc00 inswl a0, 3, t1 # t1 = 000000CCDD000000
sll t2, 5, t2 # t2 = 00198000 or t1, t0, t1 # t1 = 000000CCDDAABBCC
s8addl t2, t1, t1 # t1 = ddcc0000 srl t1, 16, t2 # t2 = 0000000000CCDDAA
zap a0, 0xfb, t2 # t2 = 00bb0000 zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00
srl t2, 8, t2 # t2 = 0000bb00 zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA
extbl a0, 3, v0 # v0 = 000000aa addl t0, t3, v0 # v0 = ssssssssDDCCBBAA
or t1, v0, v0 # v0 = ddcc00aa
or t2, v0, v0 # v0 = ddccbbaa
ret ret
END(htonl) END(htonl)

89
sysdeps/alpha/rawmemchr.S Normal file
View File

@ -0,0 +1,89 @@
/* Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return pointer to first occurrence of CH in STR. */
#include <sysdep.h>
.set noreorder
.set noat
/* __rawmemchr (generic Alpha version): a0 = start address, a1 = search
   character (only its low byte is used).  Returns in v0 the address of the
   first byte equal to that character.  There is no length argument and no
   terminator test: the only way out of the scan is a match.
   Method: replicate the character into all eight bytes of a1 by shift/or
   doubling, XOR each quadword with it so matching bytes become zero, and
   let cmpbge report the zero bytes as a bit mask.  The byte index of the
   first match is derived without cttz, by a three-step binary search.
   Registers written: v0, t0-t5, a1. */
ENTRY(__rawmemchr)
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
zapnot a1, 1, a1 # e0 : zero extend the search character
ldq_u t0, 0(a0) # .. e1 : load first quadword
sll a1, 8, t5 # e0 : replicate the search character
andnot a0, 7, v0 # .. e1 : align our loop pointer
or t5, a1, a1 # e0 :
lda t4, -1 # .. e1 : build garbage mask
sll a1, 16, t5 # e0 :
unop # :
mskqh t4, a0, t4 # e0 :
or t5, a1, a1 # .. e1 :
sll a1, 32, t5 # e0 :
cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage
or t5, a1, a1 # e0 :
xor t0, a1, t1 # .. e1 : make bytes == c zero
cmpbge zero, t1, t3 # e0 : bits set iff byte == c
unop # :
andnot t3, t4, t0 # e0 : clear garbage bits
fnop # .. fa :
unop # :
bne t0, $found # .. e1 (zdb)
.align 4
/* Aligned scan: v0 advances by 8 per iteration; t0 becomes the per-byte
   match mask. */
$loop:
ldq t0, 8(v0) # e0 :
addq v0, 8, v0 # .. e1 :
nop # e0 :
xor t0, a1, t1 # .. e1 (ev5 data stall)
cmpbge zero, t1, t0 # e0 : bits set iff byte == c
beq t0, $loop # .. e1 (zdb)
/* t0 is reduced to a single bit among bits 0-7.  Its index is rebuilt from
   three tests: 0xf0 contributes 4, 0xcc contributes 2, 0xaa contributes 1;
   the sum is added to the aligned address in v0. */
$found:
negq t0, t1 # e0 : clear all but least set bit
and t0, t1, t0 # e1 (stall)
and t0, 0xf0, t2 # e0 : binary search for that set bit
and t0, 0xcc, t3 # .. e1 :
and t0, 0xaa, t4 # e0 :
cmovne t2, 4, t2 # .. e1 :
cmovne t3, 2, t3 # e0 :
cmovne t4, 1, t4 # .. e1 :
addq t2, t3, t2 # e0 :
addq v0, t4, v0 # .. e1 :
addq v0, t2, v0 # e0 :
ret # .. e1 :
END(__rawmemchr)
weak_alias (__rawmemchr, rawmemchr)

View File

@ -65,7 +65,7 @@ $found: negq t1, t2 # clear all but least set bit
/* Now do the append. */ /* Now do the append. */
jsr t9, __stxcpy mov ra, t9
ret jmp $31, __stxcpy
END(strcat) END(strcat)

View File

@ -35,7 +35,7 @@ ENTRY(strcpy)
.prologue 1 .prologue 1
mov a0, v0 # set up return value mov a0, v0 # set up return value
jsr t9, __stxcpy # do the copy mov ra, t9
ret jmp $31, __stxcpy # do the copy
END(strcpy) END(strcpy)

View File

@ -53,7 +53,6 @@ ENTRY(strncpy)
ret # .. e1 : ret # .. e1 :
$multiword: $multiword:
subq t8, 1, t7 # e0 : clear the final bits in the prev subq t8, 1, t7 # e0 : clear the final bits in the prev
or t7, t8, t7 # e1 : word or t7, t8, t7 # e1 : word
zapnot t0, t7, t0 # e0 : zapnot t0, t7, t0 # e0 :

View File

@ -82,3 +82,11 @@ _start:
;; ;;
} }
.endp _start# .endp _start#
/* Define a symbol for the first piece of initialized data.
   __data_start is a global label in .data backed by one zero-initialized
   datum; data_start is a weak symbol set equal to the same address. */
.data
.globl __data_start
__data_start:
.long 0 /* gives the label actual storage in .data */
.weak data_start
data_start = __data_start /* weak alias: same address as __data_start */