mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-22 02:40:08 +00:00
Update.
2000-12-08 Jakub Jelinek <jakub@redhat.com> * elf/dl-load.c (fillin_rpath): Don't assume there is '\0' at cp + len. Compute where from dirname. Reported by <jreiser@BitWagon.com>. 2000-12-08 Richard Henderson <rth@twiddle.net> * sysdeps/alpha/_mcount.S: Fix typo. * sysdeps/alpha/strncpy.S: Likewise. * sysdeps/alpha/alphaev6/Implies: New file. * sysdeps/alpha/alphaev67/Implies: New file. * sysdeps/alpha/alphaev67/ffs.S: New file. * sysdeps/alpha/alphaev67/ffsll.S: New file. * sysdeps/alpha/alphaev67/rawmemchr.S: New file. * sysdeps/alpha/alphaev67/stpcpy.S: New file. * sysdeps/alpha/alphaev67/stpncpy.S: New file. * sysdeps/alpha/rawmemchr.S: New file. * sysdeps/alpha/strcat.S: Tail call to __stxcpy. * sysdeps/alpha/strcpy.S: Likewise. From GMP 3.1.1: * sysdeps/alpha/alphaev6/addmul_1.s: New file. From rick.gorton@alpha-processor.com: * sysdeps/alpha/alphaev6/memchr.S: New file. * sysdeps/alpha/alphaev6/memcpy.S: New file. * sysdeps/alpha/alphaev6/memset.S: New file. * sysdeps/alpha/alphaev6/stxcpy.S: New file. * sysdeps/alpha/alphaev6/stxncpy.S: New file. * sysdeps/alpha/alphaev67/strcat.S: New file. * sysdeps/alpha/alphaev67/strchr.S: New file. * sysdeps/alpha/alphaev67/strlen.S: New file. * sysdeps/alpha/alphaev67/strncat.S: New file. * sysdeps/alpha/htonl.S: Use a shorter sequence. 2000-12-08 Jakub Jelinek <jakub@redhat.com> * inet/getnameinfo.c (getnameinfo): Fix NI_NOFQDN support. Reported by <pspencer@fields.utoronto.ca>. 2000-12-07 Jes Sorensen <jes@linuxcare.com> * sysdeps/ia64/elf/start.S (__data_start): Add __data_start variable. Pointed out by Hans Boehm. 2000-12-07 H.J. Lu <hjl@gnu.org> * elf/dl-version.c (match_symbol): Check map->l_name[0] for printing. 2000-12-07 Andreas Jaeger <aj@suse.de> * misc/error.c: Add format attributes for __error and __error_at_line. * nscd/dbg_log.h: Add format attribute. 2000-12-08 Ulrich Drepper <drepper@redhat.com> * misc/sys/syslog.h: Add format attributes to syslog and vsyslog. Patch by Joseph S. 
Myers <jsm28@cam.ac.uk>. * sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to _dl_hwcap. * manual/install.texi: Recommend to set LANGUAGE=C LC_ALL=C during
This commit is contained in:
parent
4e9b4067d7
commit
104d0bd3ef
67
ChangeLog
67
ChangeLog
@ -1,3 +1,65 @@
|
||||
2000-12-08 Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
* elf/dl-load.c (fillin_rpath): Don't assume there is '\0' at
|
||||
cp + len. Compute where from dirname.
|
||||
Reported by <jreiser@BitWagon.com>.
|
||||
|
||||
2000-12-08 Richard Henderson <rth@twiddle.net>
|
||||
|
||||
* sysdeps/alpha/_mcount.S: Fix typo.
|
||||
* sysdeps/alpha/strncpy.S: Likewise.
|
||||
|
||||
* sysdeps/alpha/alphaev6/Implies: New file.
|
||||
* sysdeps/alpha/alphaev67/Implies: New file.
|
||||
* sysdeps/alpha/alphaev67/ffs.S: New file.
|
||||
* sysdeps/alpha/alphaev67/ffsll.S: New file.
|
||||
* sysdeps/alpha/alphaev67/rawmemchr.S: New file.
|
||||
* sysdeps/alpha/alphaev67/stpcpy.S: New file.
|
||||
* sysdeps/alpha/alphaev67/stpncpy.S: New file.
|
||||
* sysdeps/alpha/rawmemchr.S: New file.
|
||||
* sysdeps/alpha/strcat.S: Tail call to __stxcpy.
|
||||
* sysdeps/alpha/strcpy.S: Likewise.
|
||||
|
||||
From GMP 3.1.1:
|
||||
* sysdeps/alpha/alphaev6/addmul_1.s: New file.
|
||||
|
||||
From rick.gorton@alpha-processor.com:
|
||||
* sysdeps/alpha/alphaev6/memchr.S: New file.
|
||||
* sysdeps/alpha/alphaev6/memcpy.S: New file.
|
||||
* sysdeps/alpha/alphaev6/memset.S: New file.
|
||||
* sysdeps/alpha/alphaev6/stxcpy.S: New file.
|
||||
* sysdeps/alpha/alphaev6/stxncpy.S: New file.
|
||||
* sysdeps/alpha/alphaev67/strcat.S: New file.
|
||||
* sysdeps/alpha/alphaev67/strchr.S: New file.
|
||||
* sysdeps/alpha/alphaev67/strlen.S: New file.
|
||||
* sysdeps/alpha/alphaev67/strncat.S: New file.
|
||||
* sysdeps/alpha/htonl.S: Use a shorter sequence.
|
||||
|
||||
2000-12-08 Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
* inet/getnameinfo.c (getnameinfo): Fix NI_NOFQDN support.
|
||||
Reported by <pspencer@fields.utoronto.ca>.
|
||||
|
||||
2000-12-07 Jes Sorensen <jes@linuxcare.com>
|
||||
|
||||
* sysdeps/ia64/elf/start.S (__data_start): Add __data_start variable.
|
||||
Pointed out by Hans Boehm.
|
||||
|
||||
2000-12-07 H.J. Lu <hjl@gnu.org>
|
||||
|
||||
* elf/dl-version.c (match_symbol): Check map->l_name[0] for printing.
|
||||
|
||||
2000-12-07 Andreas Jaeger <aj@suse.de>
|
||||
|
||||
* misc/error.c: Add format attributes for __error and __error_at_line.
|
||||
|
||||
* nscd/dbg_log.h: Add format attribute.
|
||||
|
||||
2000-12-08 Ulrich Drepper <drepper@redhat.com>
|
||||
|
||||
* misc/sys/syslog.h: Add format attributes to syslog and vsyslog.
|
||||
Patch by Joseph S. Myers <jsm28@cam.ac.uk>.
|
||||
|
||||
2000-12-07 Dan Pop <Dan.Pop@cern.ch>
|
||||
|
||||
* sysdeps/ia64/strcpy.S: Fix a bug in a recovery code sequence.
|
||||
@ -948,8 +1010,7 @@
|
||||
|
||||
2000-11-14 Andreas Jaeger <aj@suse.de>
|
||||
|
||||
* sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to
|
||||
_dl_hwcap.
|
||||
* sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to _dl_hwcap.
|
||||
|
||||
2000-11-13 Roland McGrath <roland@frob.com>
|
||||
|
||||
@ -992,7 +1053,7 @@
|
||||
|
||||
2000-11-03 Bruno Haible <haible@clisp.cons.org>
|
||||
|
||||
* manual/install.texi: Recommend to set LANGUAGE=C LC_ALL-C during
|
||||
* manual/install.texi: Recommend to set LANGUAGE=C LC_ALL=C during
|
||||
"make install", to work around a binary incompatibility between
|
||||
glibc 2.1 and glibc 2.2 gconv modules.
|
||||
|
||||
|
@ -419,7 +419,7 @@ fillin_rpath (char *rpath, struct r_search_path_elem **result, const char *sep,
|
||||
|
||||
dirp->dirname = ((char *) dirp + sizeof (*dirp)
|
||||
+ ncapstr * sizeof (enum r_dir_status));
|
||||
memcpy ((char *) dirp->dirname, cp, len + 1);
|
||||
*((char *) __mempcpy ((char *) dirp->dirname, cp, len)) = '\0';
|
||||
dirp->dirnamelen = len;
|
||||
|
||||
if (len > max_dirnamelen)
|
||||
|
@ -95,7 +95,7 @@ match_symbol (const char *name, ElfW(Word) hash, const char *string,
|
||||
only print a message if verbose output is requested. */
|
||||
if (verbose)
|
||||
/* XXX We cannot translate the messages. */
|
||||
_dl_signal_cerror (0, map->l_name,
|
||||
_dl_signal_cerror (0, map->l_name[0] ? map->l_name : _dl_argv[0],
|
||||
make_string ("\
|
||||
no version information available (required by ",
|
||||
name, ")"));
|
||||
|
@ -259,7 +259,7 @@ getnameinfo (const struct sockaddr *sa, socklen_t addrlen, char *host,
|
||||
if (h)
|
||||
{
|
||||
char *c;
|
||||
if ((flags & NI_NOFQDN) == 0
|
||||
if ((flags & NI_NOFQDN)
|
||||
&& (c = nrl_domainname ())
|
||||
&& (c = strstr (h->h_name, c))
|
||||
&& (c != h->h_name) && (*(--c) == '.'))
|
||||
|
@ -74,10 +74,12 @@ unsigned int error_message_count;
|
||||
|
||||
/* In GNU libc we do not want to use the common name `error' directly.
|
||||
Instead make it a weak alias. */
|
||||
extern void __error (int status, int errnum, const char *message, ...);
|
||||
extern void __error (int status, int errnum, const char *message, ...)
|
||||
__attribute__ ((__format__ (__printf__, 3, 4)));
|
||||
extern void __error_at_line (int status, int errnum, const char *file_name,
|
||||
unsigned int line_number, const char *message,
|
||||
...);
|
||||
...)
|
||||
__attribute__ ((__format__ (__printf__, 5, 6)));;
|
||||
# define error __error
|
||||
# define error_at_line __error_at_line
|
||||
|
||||
|
@ -179,12 +179,13 @@ extern void openlog (__const char *__ident, int __option, int __facility)
|
||||
extern int setlogmask (int __mask) __THROW;
|
||||
|
||||
/* Generate a log message using FMT string and option arguments. */
|
||||
extern void syslog (int __pri, __const char *__fmt, ...) __THROW;
|
||||
extern void syslog (int __pri, __const char *__fmt, ...) __THROW
|
||||
__attribute__ ((__format__(__printf__, 2, 3)));
|
||||
|
||||
#ifdef __USE_BSD
|
||||
/* Generate a log message using FMT and using arguments pointed to by AP. */
|
||||
extern void vsyslog (int __pri, __const char *__fmt, __gnuc_va_list __ap)
|
||||
__THROW;
|
||||
__THROW __attribute__ ((__format__(__printf__, 2, 0)));
|
||||
#endif
|
||||
|
||||
__END_DECLS
|
||||
|
@ -22,7 +22,8 @@
|
||||
|
||||
extern int debug_level;
|
||||
|
||||
extern void dbg_log (const char *str, ...);
|
||||
/* STR is a printf-style format; the variadic arguments that follow it start at position 2 so the compiler can check them against STR. */
extern void dbg_log (const char *str, ...)
|
||||
__attribute__ ((__format__ (__printf__, 1, 2)));
|
||||
|
||||
extern int set_logfile (const char *logfile);
|
||||
|
||||
|
@ -27,7 +27,7 @@
|
||||
compiler treats those calls as if they were instructions. In
|
||||
particular, it doesn't save any of the temporary registers (caller
|
||||
saved registers). It is therefore necessary to preserve all
|
||||
caller-saved registers as well
|
||||
caller-saved registers as well.
|
||||
|
||||
Upon entering _mcount, register $at holds the return address and ra
|
||||
holds the return address of the function's caller (selfpc and frompc,
|
||||
|
1
sysdeps/alpha/alphaev6/Implies
Normal file
1
sysdeps/alpha/alphaev6/Implies
Normal file
@ -0,0 +1 @@
|
||||
alpha/alphaev5
|
479
sysdeps/alpha/alphaev6/addmul_1.s
Normal file
479
sysdeps/alpha/alphaev6/addmul_1.s
Normal file
@ -0,0 +1,479 @@
|
||||
# Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
|
||||
# the result to a second limb vector.
|
||||
#
|
||||
# Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is part of the GNU MP Library.
|
||||
#
|
||||
# The GNU MP Library is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Lesser General Public License as published
|
||||
# by the Free Software Foundation; either version 2.1 of the License, or (at
|
||||
# your option) any later version.
|
||||
#
|
||||
# The GNU MP Library is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
# License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public License
|
||||
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
|
||||
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
||||
# MA 02111-1307, USA.
|
||||
|
||||
# INPUT PARAMETERS
|
||||
# res_ptr $16
|
||||
# s1_ptr $17
|
||||
# size $18
|
||||
# s2_limb $19
|
||||
#
|
||||
# This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
|
||||
# exactly 3.625 cycles/limb on EV6...
|
||||
#
|
||||
# This code was written in close cooperation with ev6 pipeline expert
|
||||
# Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
|
||||
#
|
||||
# Register usages for unrolled loop:
|
||||
# 0-3 mul's
|
||||
# 4-7 acc's
|
||||
# 8-15 mul results
|
||||
# 20,21 carry's
|
||||
# 22,23 save for stores
|
||||
#
|
||||
# Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
|
||||
#
|
||||
# The stores can issue a cycle late so we have paired no-op's to 'catch'
|
||||
# them, so that further disturbance to the schedule is damped.
|
||||
#
|
||||
# We couldn't pair the loads, because the entangled schedule of the
|
||||
# carry's has to happen on one side {0} of the machine. Note, the total
|
||||
# use of U0, and the total use of L0 (after attending to the stores),
|
||||
# which is part of the reason why....
|
||||
#
|
||||
# This is a great schedule for the d_cache, a poor schedule for the
|
||||
# b_cache. The lockup on U0 means that any stall can't be recovered
|
||||
# from. Consider a ldq in L1. Say that load gets stalled because it
|
||||
# collides with a fill from the b_Cache. On the next cycle, this load
|
||||
# gets priority. It first looks at L0, and goes there. The instruction
|
||||
# we intended for L0 gets to look at L1, which is NOT where we want
|
||||
# it. It either stalls 1, because it can't go in L0, or goes there, and
|
||||
# causes a further instruction to stall.
|
||||
#
|
||||
# So for b_cache, we're likely going to want to put one or more cycles
|
||||
# back into the code! And, of course, put in prefetches. For the
|
||||
# accumulator, lds, intent to modify. For the multiplier, you might
|
||||
# want ldq, evict next, if you're not wanting to use it again soon. Use
|
||||
# 256 ahead of present pointer value. At a place where we have an mt
|
||||
# followed by a bookkeeping, put the bookkeeping in upper, and the
|
||||
# prefetch into lower.
|
||||
#
|
||||
# Note, the usage of physical registers per cycle is smoothed off, as
|
||||
# much as possible.
|
||||
#
|
||||
# Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
|
||||
# like not to have a ldq or stq to precede a conditional branch in a
|
||||
# quadpack. The conditional branch moves the retire pointer one cycle
|
||||
# later.
|
||||
#
|
||||
# Optimization notes:
|
||||
# Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
|
||||
# Reserved regs: $29 $30 $31
|
||||
# Free caller-saves regs in unrolled code: $24 $25 $28
|
||||
# We should swap some of the callee-saves regs for some of the free
|
||||
# caller-saves regs, saving some overhead cycles.
|
||||
# Most importantly, we should write fast code for the 0-7 case.
|
||||
# The code we use there is for the 21164, and runs at 7 cycles/limb
|
||||
# on the 21264. Should not be hard, if we write specialized code for
|
||||
# 1-7 limbs (the one for 0 limbs should be straightforward). We then just
|
||||
# need a jump table indexed by the low 3 bits of the count argument.
|
||||
|
||||
.set noreorder
|
||||
.set noat
|
||||
.text
|
||||
|
||||
.globl __mpn_addmul_1
|
||||
.ent __mpn_addmul_1
|
||||
__mpn_addmul_1:
|
||||
.frame $30,0,$26,0
|
||||
.prologue 0
|
||||
|
||||
cmpult $18, 8, $1
|
||||
beq $1, $Large
|
||||
|
||||
ldq $2, 0($17) # $2 = s1_limb
|
||||
addq $17, 8, $17 # s1_ptr++
|
||||
subq $18, 1, $18 # size--
|
||||
mulq $2, $19, $3 # $3 = prod_low
|
||||
ldq $5, 0($16) # $5 = *res_ptr
|
||||
umulh $2, $19, $0 # $0 = prod_high
|
||||
beq $18, $Lend0b # jump if size was == 1
|
||||
ldq $2, 0($17) # $2 = s1_limb
|
||||
addq $17, 8, $17 # s1_ptr++
|
||||
subq $18, 1, $18 # size--
|
||||
addq $5, $3, $3
|
||||
cmpult $3, $5, $4
|
||||
stq $3, 0($16)
|
||||
addq $16, 8, $16 # res_ptr++
|
||||
beq $18, $Lend0a # jump if size was == 2
|
||||
|
||||
.align 3
|
||||
$Loop0: mulq $2, $19, $3 # $3 = prod_low
|
||||
ldq $5, 0($16) # $5 = *res_ptr
|
||||
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||||
subq $18, 1, $18 # size--
|
||||
umulh $2, $19, $4 # $4 = cy_limb
|
||||
ldq $2, 0($17) # $2 = s1_limb
|
||||
addq $17, 8, $17 # s1_ptr++
|
||||
addq $3, $0, $3 # $3 = cy_limb + prod_low
|
||||
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||||
addq $5, $3, $3
|
||||
cmpult $3, $5, $5
|
||||
stq $3, 0($16)
|
||||
addq $16, 8, $16 # res_ptr++
|
||||
addq $5, $0, $0 # combine carries
|
||||
bne $18, $Loop0
|
||||
$Lend0a:
|
||||
mulq $2, $19, $3 # $3 = prod_low
|
||||
ldq $5, 0($16) # $5 = *res_ptr
|
||||
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||||
umulh $2, $19, $4 # $4 = cy_limb
|
||||
addq $3, $0, $3 # $3 = cy_limb + prod_low
|
||||
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||||
addq $5, $3, $3
|
||||
cmpult $3, $5, $5
|
||||
stq $3, 0($16)
|
||||
addq $5, $0, $0 # combine carries
|
||||
addq $4, $0, $0 # cy_limb = prod_high + cy
|
||||
ret $31, ($26), 1
|
||||
$Lend0b:
|
||||
addq $5, $3, $3 # $3 = *res_ptr + prod_low
|
||||
cmpult $3, $5, $5 # $5 = carry out of that add
|
||||
stq $3, 0($16) # store the single result limb
|
||||
addq $0, $5, $0 # $0 = prod_high + carry (cy_limb)
|
||||
ret $31, ($26), 1
|
||||
|
||||
$Large:
|
||||
lda $30, -240($30)
|
||||
stq $9, 8($30)
|
||||
stq $10, 16($30)
|
||||
stq $11, 24($30)
|
||||
stq $12, 32($30)
|
||||
stq $13, 40($30)
|
||||
stq $14, 48($30)
|
||||
stq $15, 56($30)
|
||||
|
||||
and $18, 7, $20 # count for the first loop, 0-7
|
||||
srl $18, 3, $18 # count for unrolled loop
|
||||
bis $31, $31, $0
|
||||
beq $20, $Lunroll
|
||||
ldq $2, 0($17) # $2 = s1_limb
|
||||
addq $17, 8, $17 # s1_ptr++
|
||||
subq $20, 1, $20 # size--
|
||||
mulq $2, $19, $3 # $3 = prod_low
|
||||
ldq $5, 0($16) # $5 = *res_ptr
|
||||
umulh $2, $19, $0 # $0 = prod_high
|
||||
beq $20, $Lend1b # jump if size was == 1
|
||||
ldq $2, 0($17) # $2 = s1_limb
|
||||
addq $17, 8, $17 # s1_ptr++
|
||||
subq $20, 1, $20 # size--
|
||||
addq $5, $3, $3
|
||||
cmpult $3, $5, $4
|
||||
stq $3, 0($16)
|
||||
addq $16, 8, $16 # res_ptr++
|
||||
beq $20, $Lend1a # jump if size was == 2
|
||||
|
||||
.align 3
|
||||
$Loop1: mulq $2, $19, $3 # $3 = prod_low
|
||||
ldq $5, 0($16) # $5 = *res_ptr
|
||||
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||||
subq $20, 1, $20 # size--
|
||||
umulh $2, $19, $4 # $4 = cy_limb
|
||||
ldq $2, 0($17) # $2 = s1_limb
|
||||
addq $17, 8, $17 # s1_ptr++
|
||||
addq $3, $0, $3 # $3 = cy_limb + prod_low
|
||||
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||||
addq $5, $3, $3
|
||||
cmpult $3, $5, $5
|
||||
stq $3, 0($16)
|
||||
addq $16, 8, $16 # res_ptr++
|
||||
addq $5, $0, $0 # combine carries
|
||||
bne $20, $Loop1
|
||||
|
||||
$Lend1a:
|
||||
mulq $2, $19, $3 # $3 = prod_low
|
||||
ldq $5, 0($16) # $5 = *res_ptr
|
||||
addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||||
umulh $2, $19, $4 # $4 = cy_limb
|
||||
addq $3, $0, $3 # $3 = cy_limb + prod_low
|
||||
cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||||
addq $5, $3, $3
|
||||
cmpult $3, $5, $5
|
||||
stq $3, 0($16)
|
||||
addq $16, 8, $16 # res_ptr++
|
||||
addq $5, $0, $0 # combine carries
|
||||
addq $4, $0, $0 # cy_limb = prod_high + cy
|
||||
br $31, $Lunroll
|
||||
$Lend1b:
|
||||
addq $5, $3, $3 # $3 = *res_ptr + prod_low
|
||||
cmpult $3, $5, $5 # $5 = carry out of that add
|
||||
stq $3, 0($16) # store the result limb
|
||||
addq $16, 8, $16 # res_ptr++
|
||||
addq $0, $5, $0 # $0 = prod_high + carry, carried into $Lunroll
|
||||
|
||||
$Lunroll:
|
||||
lda $17, -16($17) # L1 bookkeeping
|
||||
lda $16, -16($16) # L1 bookkeeping
|
||||
bis $0, $31, $12
|
||||
|
||||
# ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
|
||||
|
||||
ldq $2, 16($17) # L1
|
||||
ldq $3, 24($17) # L1
|
||||
lda $18, -1($18) # L1 bookkeeping
|
||||
ldq $6, 16($16) # L1
|
||||
ldq $7, 24($16) # L1
|
||||
ldq $0, 32($17) # L1
|
||||
mulq $19, $2, $13 # U1
|
||||
ldq $1, 40($17) # L1
|
||||
umulh $19, $2, $14 # U1
|
||||
mulq $19, $3, $15 # U1
|
||||
lda $17, 64($17) # L1 bookkeeping
|
||||
ldq $4, 32($16) # L1
|
||||
ldq $5, 40($16) # L1
|
||||
umulh $19, $3, $8 # U1
|
||||
ldq $2, -16($17) # L1
|
||||
mulq $19, $0, $9 # U1
|
||||
ldq $3, -8($17) # L1
|
||||
umulh $19, $0, $10 # U1
|
||||
addq $6, $13, $6 # L0 lo + acc
|
||||
mulq $19, $1, $11 # U1
|
||||
cmpult $6, $13, $20 # L0 lo add => carry
|
||||
lda $16, 64($16) # L1 bookkeeping
|
||||
addq $6, $12, $22 # U0 hi add => answer
|
||||
cmpult $22, $12, $21 # L0 hi add => carry
|
||||
addq $14, $20, $14 # U0 hi mul + carry
|
||||
ldq $6, -16($16) # L1
|
||||
addq $7, $15, $23 # L0 lo + acc
|
||||
addq $14, $21, $14 # U0 hi mul + carry
|
||||
ldq $7, -8($16) # L1
|
||||
umulh $19, $1, $12 # U1
|
||||
cmpult $23, $15, $20 # L0 lo add => carry
|
||||
addq $23, $14, $23 # U0 hi add => answer
|
||||
ldq $0, 0($17) # L1
|
||||
mulq $19, $2, $13 # U1
|
||||
cmpult $23, $14, $21 # L0 hi add => carry
|
||||
addq $8, $20, $8 # U0 hi mul + carry
|
||||
ldq $1, 8($17) # L1
|
||||
umulh $19, $2, $14 # U1
|
||||
addq $4, $9, $4 # L0 lo + acc
|
||||
stq $22, -48($16) # L0
|
||||
stq $23, -40($16) # L1
|
||||
mulq $19, $3, $15 # U1
|
||||
addq $8, $21, $8 # U0 hi mul + carry
|
||||
cmpult $4, $9, $20 # L0 lo add => carry
|
||||
addq $4, $8, $22 # U0 hi add => answer
|
||||
ble $18, $Lend # U1 bookkeeping
|
||||
|
||||
# ____ MAIN UNROLLED LOOP ____
|
||||
.align 4
|
||||
$Loop:
|
||||
bis $31, $31, $31 # U1 mt
|
||||
cmpult $22, $8, $21 # L0 hi add => carry
|
||||
addq $10, $20, $10 # U0 hi mul + carry
|
||||
ldq $4, 0($16) # L1
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
addq $5, $11, $23 # L0 lo + acc
|
||||
addq $10, $21, $10 # L0 hi mul + carry
|
||||
ldq $5, 8($16) # L1
|
||||
|
||||
umulh $19, $3, $8 # U1
|
||||
cmpult $23, $11, $20 # L0 lo add => carry
|
||||
addq $23, $10, $23 # U0 hi add => answer
|
||||
ldq $2, 16($17) # L1
|
||||
|
||||
mulq $19, $0, $9 # U1
|
||||
cmpult $23, $10, $21 # L0 hi add => carry
|
||||
addq $12, $20, $12 # U0 hi mul + carry
|
||||
ldq $3, 24($17) # L1
|
||||
|
||||
umulh $19, $0, $10 # U1
|
||||
addq $6, $13, $6 # L0 lo + acc
|
||||
stq $22, -32($16) # L0
|
||||
stq $23, -24($16) # L1
|
||||
|
||||
bis $31, $31, $31 # L0 st slosh
|
||||
mulq $19, $1, $11 # U1
|
||||
bis $31, $31, $31 # L1 st slosh
|
||||
addq $12, $21, $12 # U0 hi mul + carry
|
||||
|
||||
cmpult $6, $13, $20 # L0 lo add => carry
|
||||
bis $31, $31, $31 # U1 mt
|
||||
lda $18, -1($18) # L1 bookkeeping
|
||||
addq $6, $12, $22 # U0 hi add => answer
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
cmpult $22, $12, $21 # L0 hi add => carry
|
||||
addq $14, $20, $14 # U0 hi mul + carry
|
||||
ldq $6, 16($16) # L1
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
addq $7, $15, $23 # L0 lo + acc
|
||||
addq $14, $21, $14 # U0 hi mul + carry
|
||||
ldq $7, 24($16) # L1
|
||||
|
||||
umulh $19, $1, $12 # U1
|
||||
cmpult $23, $15, $20 # L0 lo add => carry
|
||||
addq $23, $14, $23 # U0 hi add => answer
|
||||
ldq $0, 32($17) # L1
|
||||
|
||||
mulq $19, $2, $13 # U1
|
||||
cmpult $23, $14, $21 # L0 hi add => carry
|
||||
addq $8, $20, $8 # U0 hi mul + carry
|
||||
ldq $1, 40($17) # L1
|
||||
|
||||
umulh $19, $2, $14 # U1
|
||||
addq $4, $9, $4 # U0 lo + acc
|
||||
stq $22, -16($16) # L0
|
||||
stq $23, -8($16) # L1
|
||||
|
||||
bis $31, $31, $31 # L0 st slosh
|
||||
mulq $19, $3, $15 # U1
|
||||
bis $31, $31, $31 # L1 st slosh
|
||||
addq $8, $21, $8 # L0 hi mul + carry
|
||||
|
||||
cmpult $4, $9, $20 # L0 lo add => carry
|
||||
bis $31, $31, $31 # U1 mt
|
||||
lda $17, 64($17) # L1 bookkeeping
|
||||
addq $4, $8, $22 # U0 hi add => answer
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
cmpult $22, $8, $21 # L0 hi add => carry
|
||||
addq $10, $20, $10 # U0 hi mul + carry
|
||||
ldq $4, 32($16) # L1
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
addq $5, $11, $23 # L0 lo + acc
|
||||
addq $10, $21, $10 # L0 hi mul + carry
|
||||
ldq $5, 40($16) # L1
|
||||
|
||||
umulh $19, $3, $8 # U1
|
||||
cmpult $23, $11, $20 # L0 lo add => carry
|
||||
addq $23, $10, $23 # U0 hi add => answer
|
||||
ldq $2, -16($17) # L1
|
||||
|
||||
mulq $19, $0, $9 # U1
|
||||
cmpult $23, $10, $21 # L0 hi add => carry
|
||||
addq $12, $20, $12 # U0 hi mul + carry
|
||||
ldq $3, -8($17) # L1
|
||||
|
||||
umulh $19, $0, $10 # U1
|
||||
addq $6, $13, $6 # L0 lo + acc
|
||||
stq $22, 0($16) # L0
|
||||
stq $23, 8($16) # L1
|
||||
|
||||
bis $31, $31, $31 # L0 st slosh
|
||||
mulq $19, $1, $11 # U1
|
||||
bis $31, $31, $31 # L1 st slosh
|
||||
addq $12, $21, $12 # U0 hi mul + carry
|
||||
|
||||
cmpult $6, $13, $20 # L0 lo add => carry
|
||||
bis $31, $31, $31 # U1 mt
|
||||
lda $16, 64($16) # L1 bookkeeping
|
||||
addq $6, $12, $22 # U0 hi add => answer
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
cmpult $22, $12, $21 # L0 hi add => carry
|
||||
addq $14, $20, $14 # U0 hi mul + carry
|
||||
ldq $6, -16($16) # L1
|
||||
|
||||
bis $31, $31, $31 # U1 mt
|
||||
addq $7, $15, $23 # L0 lo + acc
|
||||
addq $14, $21, $14 # U0 hi mul + carry
|
||||
ldq $7, -8($16) # L1
|
||||
|
||||
umulh $19, $1, $12 # U1
|
||||
cmpult $23, $15, $20 # L0 lo add => carry
|
||||
addq $23, $14, $23 # U0 hi add => answer
|
||||
ldq $0, 0($17) # L1
|
||||
|
||||
mulq $19, $2, $13 # U1
|
||||
cmpult $23, $14, $21 # L0 hi add => carry
|
||||
addq $8, $20, $8 # U0 hi mul + carry
|
||||
ldq $1, 8($17) # L1
|
||||
|
||||
umulh $19, $2, $14 # U1
|
||||
addq $4, $9, $4 # L0 lo + acc
|
||||
stq $22, -48($16) # L0
|
||||
stq $23, -40($16) # L1
|
||||
|
||||
bis $31, $31, $31 # L0 st slosh
|
||||
mulq $19, $3, $15 # U1
|
||||
bis $31, $31, $31 # L1 st slosh
|
||||
addq $8, $21, $8 # U0 hi mul + carry
|
||||
|
||||
cmpult $4, $9, $20 # L0 lo add => carry
|
||||
addq $4, $8, $22 # U0 hi add => answer
|
||||
bis $31, $31, $31 # L1 mt
|
||||
bgt $18, $Loop # U1 bookkeeping
|
||||
|
||||
# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
|
||||
$Lend:
|
||||
cmpult $22, $8, $21 # L0 hi add => carry
|
||||
addq $10, $20, $10 # U0 hi mul + carry
|
||||
ldq $4, 0($16) # L1
|
||||
addq $5, $11, $23 # L0 lo + acc
|
||||
addq $10, $21, $10 # L0 hi mul + carry
|
||||
ldq $5, 8($16) # L1
|
||||
umulh $19, $3, $8 # U1
|
||||
cmpult $23, $11, $20 # L0 lo add => carry
|
||||
addq $23, $10, $23 # U0 hi add => answer
|
||||
mulq $19, $0, $9 # U1
|
||||
cmpult $23, $10, $21 # L0 hi add => carry
|
||||
addq $12, $20, $12 # U0 hi mul + carry
|
||||
umulh $19, $0, $10 # U1
|
||||
addq $6, $13, $6 # L0 lo + acc
|
||||
stq $22, -32($16) # L0
|
||||
stq $23, -24($16) # L1
|
||||
mulq $19, $1, $11 # U1
|
||||
addq $12, $21, $12 # U0 hi mul + carry
|
||||
cmpult $6, $13, $20 # L0 lo add => carry
|
||||
addq $6, $12, $22 # U0 hi add => answer
|
||||
cmpult $22, $12, $21 # L0 hi add => carry
|
||||
addq $14, $20, $14 # U0 hi mul + carry
|
||||
addq $7, $15, $23 # L0 lo + acc
|
||||
addq $14, $21, $14 # U0 hi mul + carry
|
||||
umulh $19, $1, $12 # U1
|
||||
cmpult $23, $15, $20 # L0 lo add => carry
|
||||
addq $23, $14, $23 # U0 hi add => answer
|
||||
cmpult $23, $14, $21 # L0 hi add => carry
|
||||
addq $8, $20, $8 # U0 hi mul + carry
|
||||
addq $4, $9, $4 # U0 lo + acc
|
||||
stq $22, -16($16) # L0
|
||||
stq $23, -8($16) # L1
|
||||
bis $31, $31, $31 # L0 st slosh
|
||||
addq $8, $21, $8 # L0 hi mul + carry
|
||||
cmpult $4, $9, $20 # L0 lo add => carry
|
||||
addq $4, $8, $22 # U0 hi add => answer
|
||||
cmpult $22, $8, $21 # L0 hi add => carry
|
||||
addq $10, $20, $10 # U0 hi mul + carry
|
||||
addq $5, $11, $23 # L0 lo + acc
|
||||
addq $10, $21, $10 # L0 hi mul + carry
|
||||
cmpult $23, $11, $20 # L0 lo add => carry
|
||||
addq $23, $10, $23 # U0 hi add => answer
|
||||
cmpult $23, $10, $21 # L0 hi add => carry
|
||||
addq $12, $20, $12 # U0 hi mul + carry
|
||||
stq $22, 0($16) # L0
|
||||
stq $23, 8($16) # L1
|
||||
addq $12, $21, $0 # U0 hi mul + carry
|
||||
|
||||
ldq $9, 8($30)
|
||||
ldq $10, 16($30)
|
||||
ldq $11, 24($30)
|
||||
ldq $12, 32($30)
|
||||
ldq $13, 40($30)
|
||||
ldq $14, 48($30)
|
||||
ldq $15, 56($30)
|
||||
lda $30, 240($30)
|
||||
ret $31, ($26), 1
|
||||
|
||||
.end __mpn_addmul_1
|
192
sysdeps/alpha/alphaev6/memchr.S
Normal file
192
sysdeps/alpha/alphaev6/memchr.S
Normal file
@ -0,0 +1,192 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
Contributed by David Mosberger (davidm@cs.arizona.edu).
|
||||
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
ENTRY(__memchr)
|
||||
#ifdef PROF
|
||||
ldgp gp, 0(pv)
|
||||
lda AT, _mcount
|
||||
jsr AT, (AT), _mcount
|
||||
.prologue 1
|
||||
#else
|
||||
.prologue 0
|
||||
#endif
|
||||
|
||||
# Hack -- if someone passes in (size_t)-1, hoping to just
|
||||
# search til the end of the address space, we will overflow
|
||||
# below when we find the address of the last byte. Given
|
||||
# that we will never have a 56-bit address space, cropping
|
||||
# the length is the easiest way to avoid trouble.
|
||||
zap $18, 0x80, $5 # U : Bound length
|
||||
beq $18, $not_found # U :
|
||||
ldq_u $1, 0($16) # L : load first quadword Latency=3
|
||||
and $17, 0xff, $17 # E : L L U U : 00000000000000ch
|
||||
|
||||
insbl $17, 1, $2 # U : 000000000000ch00
|
||||
cmpult $18, 9, $4 # E : small (< 1 quad) string?
|
||||
or $2, $17, $17 # E : 000000000000chch
|
||||
lda $3, -1($31) # E : U L L U
|
||||
|
||||
sll $17, 16, $2 # U : 00000000chch0000
|
||||
addq $16, $5, $5 # E : Max search address
|
||||
or $2, $17, $17 # E : 00000000chchchch
|
||||
sll $17, 32, $2 # U : U L L U : chchchch00000000
|
||||
|
||||
or $2, $17, $17 # E : chchchchchchchch
|
||||
extql $1, $16, $7 # U : $7 is upper bits
|
||||
beq $4, $first_quad # U :
|
||||
ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3
|
||||
|
||||
extqh $6, $16, $6 # U : 2 cycle stall for $6
|
||||
mov $16, $0 # E :
|
||||
nop # E :
|
||||
or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
|
||||
|
||||
# Deal with the case where at most 8 bytes remain to be searched
|
||||
# in $1. E.g.:
|
||||
# $18 = 6
|
||||
# $1 = ????c6c5c4c3c2c1
|
||||
$last_quad:
|
||||
negq $18, $6 # E :
|
||||
xor $17, $1, $1 # E :
|
||||
srl $3, $6, $6 # U : $6 = mask of $18 bits set
|
||||
cmpbge $31, $1, $2 # E : L U L U
|
||||
|
||||
nop
|
||||
nop
|
||||
and $2, $6, $2 # E :
|
||||
beq $2, $not_found # U : U L U L
|
||||
|
||||
$found_it:
|
||||
#if defined(__alpha_fix__) && defined(__alpha_cix__)
|
||||
/*
|
||||
* Since we are guaranteed to have set one of the bits, we don't
|
||||
* have to worry about coming back with a 0x40 out of cttz...
|
||||
*/
|
||||
cttz $2, $3 # U0 :
|
||||
addq $0, $3, $0 # E : All done
|
||||
nop # E :
|
||||
ret # L0 : L U L U
|
||||
#else
|
||||
/*
|
||||
* Slow and clunky. It can probably be improved.
|
||||
* An exercise left for others.
|
||||
*/
|
||||
negq $2, $3 # E :
|
||||
and $2, $3, $2 # E :
|
||||
and $2, 0x0f, $1 # E :
|
||||
addq $0, 4, $3 # E :
|
||||
|
||||
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
|
||||
nop # E : keep with cmov
|
||||
and $2, 0x33, $1 # E :
|
||||
addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
|
||||
|
||||
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
|
||||
nop # E : keep with cmov
|
||||
and $2, 0x55, $1 # E :
|
||||
addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
|
||||
|
||||
cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
|
||||
nop
|
||||
nop
|
||||
ret # L0 : L U L U
|
||||
#endif
|
||||
|
||||
# Deal with the case where $18 > 8 bytes remain to be
|
||||
# searched. $16 may not be aligned.
|
||||
.align 4
|
||||
$first_quad:
|
||||
andnot $16, 0x7, $0 # E :
|
||||
insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
|
||||
xor $1, $17, $1 # E :
|
||||
or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff
|
||||
|
||||
cmpbge $31, $1, $2 # E :
|
||||
bne $2, $found_it # U :
|
||||
# At least one byte left to process.
|
||||
ldq $1, 8($0) # L :
|
||||
subq $5, 1, $18 # E : U L U L
|
||||
|
||||
addq $0, 8, $0 # E :
|
||||
# Make $18 point to last quad to be accessed (the
|
||||
# last quad may or may not be partial).
|
||||
andnot $18, 0x7, $18 # E :
|
||||
cmpult $0, $18, $2 # E :
|
||||
beq $2, $final # U : U L U L
|
||||
|
||||
# At least two quads remain to be accessed.
|
||||
|
||||
subq $18, $0, $4 # E : $4 <- nr quads to be processed
|
||||
and $4, 8, $4 # E : odd number of quads?
|
||||
bne $4, $odd_quad_count # U :
|
||||
# At least three quads remain to be accessed
|
||||
mov $1, $4 # E : L U L U : move prefetched value to correct reg
|
||||
|
||||
.align 4
|
||||
$unrolled_loop:
|
||||
ldq $1, 8($0) # L : prefetch $1
|
||||
xor $17, $4, $2 # E :
|
||||
cmpbge $31, $2, $2 # E :
|
||||
bne $2, $found_it # U : U L U L
|
||||
|
||||
addq $0, 8, $0 # E :
|
||||
nop # E :
|
||||
nop # E :
|
||||
nop # E :
|
||||
|
||||
$odd_quad_count:
|
||||
xor $17, $1, $2 # E :
|
||||
ldq $4, 8($0) # L : prefetch $4
|
||||
cmpbge $31, $2, $2 # E :
|
||||
addq $0, 8, $6 # E :
|
||||
|
||||
bne $2, $found_it # U :
|
||||
cmpult $6, $18, $6 # E :
|
||||
addq $0, 8, $0 # E :
|
||||
nop # E :
|
||||
|
||||
bne $6, $unrolled_loop # U :
|
||||
mov $4, $1 # E : move prefetched value into $1
|
||||
nop # E :
|
||||
nop # E :
|
||||
|
||||
$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
|
||||
nop # E :
|
||||
nop # E :
|
||||
bne $18, $last_quad # U :
|
||||
|
||||
$not_found:
|
||||
mov $31, $0 # E :
|
||||
nop # E :
|
||||
nop # E :
|
||||
ret # L0 :
|
||||
|
||||
END(__memchr)
|
||||
|
||||
weak_alias (__memchr, memchr)
|
||||
#if !__BOUNDED_POINTERS__
|
||||
weak_alias (__memchr, __ubp_memchr)
|
||||
#endif
|
254
sysdeps/alpha/alphaev6/memcpy.S
Normal file
254
sysdeps/alpha/alphaev6/memcpy.S
Normal file
@ -0,0 +1,254 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/*
|
||||
* Much of the information about 21264 scheduling/coding comes from:
|
||||
* Compiler Writer's Guide for the Alpha 21264
|
||||
* abbreviated as 'CWG' in other comments here
|
||||
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
|
||||
* Scheduling notation:
|
||||
* E - either cluster
|
||||
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
|
||||
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
|
||||
*
|
||||
* Temp usage notes:
|
||||
* $0 - destination address
|
||||
* $1,$2, - scratch
|
||||
*/
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
ENTRY(memcpy)
/*
 * void *memcpy (void *dest, const void *src, size_t n)
 *
 * In:	$16 = dest, $17 = src, $18 = byte count
 * Out:	$0  = original dest
 * Scratch: $1-$7 (and $16-$18 are consumed)
 *
 * Strategy:
 *  - src and dest equally aligned mod 8: byte-copy up to a quadword
 *    boundary, quad-copy up to a 64-byte boundary, then run a 64-byte
 *    unrolled loop with wh64 write hints, then mop up quads and bytes.
 *  - otherwise: byte-copy until dest is quad aligned, then merge pairs
 *    of unaligned source quads with extql/extqh, then mop up bytes.
 *
 * wh64 promises that the whole aligned 64-byte block is about to be
 * overwritten, so it must never name a block that reaches beyond the
 * end of the destination.
 */

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

	/* Copy single quads until dest is 64-byte aligned (at most 7).  */
$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

	/*
	 * The first wh64 issued in the loop targets dest+64, i.e. the
	 * block written by the *second* trip.  That is only safe when at
	 * least two full trips (128 bytes) remain; the head-alignment
	 * loop above may have consumed up to 56 bytes, so re-check here.
	 * With 64..127 bytes left the hinted block would extend past the
	 * end of the destination, so fall through to the quad loop.
	 */
$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * Invariant at loop top: $18 >= 64 and the 64-byte block at ($7)
	 * lies entirely inside the remaining destination.
	 */
$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					# ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body

	/* Fewer than 64 (or, on direct entry, 128) bytes left: quads.  */
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

	/* $18 is biased by -8 here: >= 0 means another full quad.  */
$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * src and dest differ mod 8.  $4 walks the destination from here
	 * on; $16 is reused as a data register in $mis_quad.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : dest is already quad aligned
	nop

$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * $3 holds the aligned quad containing src[0]; fetch the next
	 * aligned quad and splice the two into one destination quad.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

END(memcpy)
|
224
sysdeps/alpha/alphaev6/memset.S
Normal file
224
sysdeps/alpha/alphaev6/memset.S
Normal file
@ -0,0 +1,224 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson (rth@tamu.edu)
|
||||
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
ENTRY(memset)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

/*
 * void *memset (void *dest, int c, size_t n)
 *
 * In:	a0 = dest, a1 = fill byte, a2 = byte count
 * Out:	v0 = original dest
 *
 * Register roles after setup:
 *	a1 = fill byte replicated into all eight byte lanes
 *	t5 = dest + count (one past the last byte to write)
 *	t4 = running store address
 *	t2 = whole quadwords still to store
 *	a2 = trailing byte count (0..7) once $aligned is reached
 *
 * The constant build-up below is interleaved with the address
 * arithmetic; it still stalls, and untangling that is left for later.
 */
	and	a1, 255, t0		# E : 00000000000000ch
	insbl	a1, 1, t1		# U : 000000000000ch00
	mov	a0, v0			# E : result is the original dest
	ble	a2, $end		# U : nothing to do for count <= 0

	addq	a2, a0, t5		# E : t5 = end address (exclusive)
	or	t0, t1, a1		# E : 000000000000chch
	insbl	t0, 2, t2		# U : 0000000000ch0000
	insbl	t0, 3, t3		# U : 00000000ch000000

	or	t2, t3, t2		# E : 00000000chch0000
	inswl	a1, 4, t4		# U : 0000chch00000000
	xor	a0, t5, t0		# E : bits in which start and end differ
	inswl	a1, 6, t1		# U : chch000000000000

	or	a1, t2, a1		# E : 00000000chchchch
	or	t1, t4, t1		# E : chchchch00000000
	bic	t0, 7, t0		# E : zero iff start and end share a quad
	and	a0, 7, t2		# E : misalignment of dest

	or	a1, t1, a1		# E : chchchchchchchch
	beq	t0, $within_quad	# U : whole fill sits in one quadword
	nop				# E :
	beq	t2, $aligned		# U : dest already 0mod8

	/*
	 * Misaligned start that spills past its quadword: merge the fill
	 * into the upper bytes of the first quad, then continue aligned.
	 */
	ldq_u	t3, 0(a0)		# L : quad holding the first bytes
	mov	a0, t4			# E : remember where it came from
	insql	a1, a0, t1		# U : fill bytes shifted into place
	subq	t2, 8, t2		# E : -(bytes covered by this quad)

	addq	a2, t2, a2		# E : count minus the head bytes
	mskql	t3, a0, t3		# U : keep only the bytes below dest
	subq	a0, t2, a0		# E : dest rounded up to next quad
	or	t1, t3, t0		# E : merged first quad

	nop
	stq_u	t0, 0(t4)		# L : write merged first quad
	nop
	nop

	.align 4
$aligned:
	/* dest is quad aligned and at least a partial quad remains.  */

	sra	a2, 3, t2		# U : quads to store
	and	a2, 7, a2		# E : bytes left over after the quads
	mov	a0, t4			# E : running store address
	beq	t2, $no_quad		# U : only a tail to write

	/*
	 * With 16 or more quads it pays to align to 64 bytes and use the
	 * unrolled wh64 loop.  t0 becomes a negative byte counter that
	 * reaches zero when t4 hits the next 64-byte boundary.
	 */
	and	a0, 0x3f, t1		# E : offset within the 64-byte line
	subq	t2, 16, t3		# E : fewer than 16 quads?
	subq	t1, 0x40, t0		# E : t0 = offset - 64 (always < 0)
	blt	t3, $loop		# U : too small, use the simple loop

	/* At least 16 quads: enough for the alignment run plus one trip.  */
	nop				# E :
	nop				# E :
	nop				# E :
	beq	t0, $bigalign		# U :

$alignmod64:
	stq	a1, 0(t4)		# L :
	subq	t2, 1, t2		# E : one quad fewer
	addq	t0, 8, t0		# E : step counter towards zero
	addq	t4, 8, t3		# E : ends up as the first wh64 address

	nop
	nop
	addq	t4, 8, t4		# E : advance store address
	blt	t0, $alignmod64		# U :

$bigalign:
	/*
	 * t2 = quads left, t4 = store address (0mod64), a1 = fill pattern,
	 * t3 = address for the next wh64.  Scratch: t6, t1, t3, t0.
	 * At least one trip is taken.  Each trip hints the block two
	 * trips ahead (t4 + 128); when fewer than 24 quads remain that
	 * block would not be written in full, so the hint falls back to
	 * the next trip's own block (t4 + 64).
	 * CWG 3.7.6: do not expect more than one store per cycle sustained.
	 */

$do_wh64:
	wh64	(t3)			# L1 : write hint for a full 64-byte block
	subq	t2, 24, t1		# E : < 0 when no trip +2 will happen
	stq	a1, 0(t4)		# L :
	nop				# E :

	addq	t4, 128, t3		# E : optimistic next hint address
	stq	a1, 8(t4)		# L :
	stq	a1, 16(t4)		# L :
	addq	t4, 64, t6		# E : conservative hint address (next trip)

	stq	a1, 24(t4)		# L :
	stq	a1, 32(t4)		# L :
	cmovlt	t1, t6, t3		# E : pick the conservative address if needed
	nop

	stq	a1, 40(t4)		# L :
	stq	a1, 48(t4)		# L :
	subq	t2, 16, t1		# E : >= 0 when another full trip fits
	nop

	stq	a1, 56(t4)		# L :
	addq	t4, 64, t4		# E :
	subq	t2, 8, t2		# E :
	bge	t1, $do_wh64		# U :

	nop
	nop
	nop
	beq	t2, $no_quad		# U : no leftover quads

	.align 4
	/*
	 * One quad per iteration: used for leftovers of the unrolled loop
	 * and for fills too small to be worth unrolling.
	 */
$loop:
	stq	a1, 0(t4)		# L :
	subq	t2, 1, t2		# E : one quad fewer
	addq	t4, 8, t4		# E : advance store address
	bne	t2, $loop		# U : keep going while quads remain

$no_quad:
	/* Merge the 0..7 trailing bytes into the last quad.  */
	nop				# E :
	beq	a2, $end		# U : no tail bytes
	ldq	t6, 0(t4)		# L : current contents of the last quad
	mskqh	t6, t5, t1		# U : keep the bytes at and above the end

	insqh	a1, t5, t3		# U : fill bytes below the end
	or	t1, t3, t0		# E : merged last quad
	stq	t0, 0(t4)		# L : write it back
	ret	zero, (ra), 1		# L0 :

$within_quad:
	/* Start and end fall inside the same quadword.  */
	ldq_u	t0, 0(a0)		# L : the quad being patched
	insql	a1, a0, t1		# U : fill from dest upwards
	mskql	t0, a0, t3		# U : original bytes below dest
	or	t1, t3, t1		# E : original low part + fill above

	mskql	t1, t5, t3		# U : drop everything at and above the end
	mskqh	t0, t5, t1		# U : original bytes at and above the end
	or	t1, t3, t0		# E : final merged quad
	stq_u	t0, 0(a0)		# L :

$end:
	nop
	nop
	nop
	ret	zero, (ra), 1		# L0 :

END(memset)
|
329
sysdeps/alpha/alphaev6/stxcpy.S
Normal file
329
sysdeps/alpha/alphaev6/stxcpy.S
Normal file
@ -0,0 +1,329 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson (rth@tamu.edu)
|
||||
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Copy a null-terminated string from SRC to DST.
|
||||
|
||||
This is an internal routine used by strcpy, stpcpy, and strcat.
|
||||
As such, it uses special linkage conventions to make implementation
|
||||
of these public functions more efficient.
|
||||
|
||||
On input:
|
||||
t9 = return address
|
||||
a0 = DST
|
||||
a1 = SRC
|
||||
|
||||
On output:
|
||||
t8 = bitmask (with one bit set) indicating the last byte written
|
||||
a0 = unaligned address of the last *word* written
|
||||
|
||||
Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
||||
*/
|
||||
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noat
|
||||
.set noreorder
|
||||
.text
|
||||
|
||||
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
|
||||
doesn't like putting the entry point for a procedure somewhere in the
|
||||
middle of the procedure descriptor. Work around this by putting the
|
||||
aligned copy in its own procedure descriptor */
|
||||
|
||||
|
||||
.ent stxcpy_aligned
	.align 4
stxcpy_aligned:
	.frame sp, 0, t9
	.prologue 0

	/* Co-aligned copy loop, entered by branch from __stxcpy.

	   On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word
	   a1 == source address, already advanced past the first word
		 (only its low three bits are used for the masks below).

	   On exit:
	   t8 == single-bit mask for the last byte written (the null)
	   a0 == address of the last word written
	   Clobbers t0-t3, t6, t8, t10.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t8, $a_eos	# U : (stall)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t8	# E : (3 cycle stall)
	beq	t8, $a_loop	# U : (stall for t8)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t8 == the cmpbge mask that found it.

	   The isolated low bit is kept in t8 because that is the register
	   in which the one-bit "last byte written" mask is returned; the
	   multi-bit "bytes <= null" mask is only a temporary and lives
	   in t10.  */
$a_eos:
	negq	t8, t6		# E : find low bit set
	and	t8, t6, t8	# E : t8 = bit of the first null (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t8, 0x80, t6	# E : null in the top byte? (stall)
	bne	t6, 1f		# U : then the whole word is stored (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t8, 1, t6	# E : bits for the bytes below the null
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t8, t6, t10	# E : bits for the bytes <= null (stall)

	zap	t0, t10, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.end stxcpy_aligned
|
||||
|
||||
.align 4
|
||||
.ent __stxcpy
	.globl __stxcpy
__stxcpy:
	.frame sp, 0, t9
	.prologue 0

	/* Copy a null-terminated string; special linkage.

	   On input:  t9 = return address, a0 = DST, a1 = SRC.
	   On output: t8 = single-bit mask for the last byte written,
		      a0 = unaligned address of the last word written.
	   Clobbers t0-t6, t8, t10.

	   In every exit path the isolated "first null" bit is kept in t8,
	   the register in which it is returned; the multi-bit
	   "bytes <= null" mask is a temporary and lives in t10.  */

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)		# L : load first src word
	and	a0, 7, t0		# E : take care not to load a word ...
	addq	a1, 8, a1		# E :
	beq	t0, stxcpy_aligned	# U : ... if we wont need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t8	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t8, $u_final	# U : (stall)

	mskql	t6, a1, t6		# U : mask out the bits we have
	or	t6, t2, t2		# E :   already extracted before (stall)
	cmpbge	zero, t2, t8		# E :   testing eos (stall)
	bne	t8, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# U : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E : (stall for t2)
	nop			# E :
	bne	t8, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	t8, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t8	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t8 == cmpbge mask that found the null.
	   On exit t8 holds only the bit of the first null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t8	# E : t8 = bit of the first null (stall)
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# E :
	subq	t8, 1, t6	# E : bits for the bytes below the null
	or	t6, t8, t10	# E : bits for the bytes <= null (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t10, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t10	# E :
	beq	t10, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
	beq	t8, $u_head	# U : (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	and	t6, t8, t8	# E : t8 = bit of the first null (stall)
	and	a1, 7, t5	# E : src minus dest misalignment

	subq	t8, 1, t6	# E :
	or	t6, t8, t10	# E : bits for the bytes <= null (stall)
	srl	t8, t5, t8	# U : adjust final null return value
	zapnot	t2, t10, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# .. e1 : zero place for source to reside (stall)

	or	t0, t1, t1	# e1    : and put it there
	stq_u	t1, 0(a0)	# .. e0 : (stall)
	ret	(t9)		# e1    :
	nop

	.end __stxcpy
|
||||
|
405
sysdeps/alpha/alphaev6/stxncpy.S
Normal file
405
sysdeps/alpha/alphaev6/stxncpy.S
Normal file
@ -0,0 +1,405 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson (rth@tamu.edu)
|
||||
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Copy no more than COUNT bytes of the null-terminated string from
|
||||
SRC to DST.
|
||||
|
||||
This is an internal routine used by strncpy, stpncpy, and strncat.
|
||||
As such, it uses special linkage conventions to make implementation
|
||||
of these public functions more efficient.
|
||||
|
||||
On input:
|
||||
t9 = return address
|
||||
a0 = DST
|
||||
a1 = SRC
|
||||
a2 = COUNT
|
||||
|
||||
Furthermore, COUNT may not be zero.
|
||||
|
||||
On output:
|
||||
t0 = last word written
|
||||
t8 = bitmask (with one bit set) indicating the last byte written
|
||||
t10 = bitmask (with one bit set) indicating the byte position of
|
||||
the end of the range specified by COUNT
|
||||
a0 = unaligned address of the last *word* written
|
||||
a2 = the number of full words left in COUNT
|
||||
|
||||
Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
||||
*/
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
|
||||
doesn't like putting the entry point for a procedure somewhere in the
|
||||
middle of the procedure descriptor. Work around this by putting the
|
||||
aligned copy in its own procedure descriptor */
|
||||
|
||||
|
||||
.ent stxncpy_aligned
	.align 4
stxncpy_aligned:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Co-aligned copy path.  __stxncpy branches here with
	     t0  = first destination word (zero when DST is 8-byte aligned)
	     t1  = first source word
	     a1  = SRC + 8 (its low three bits still give the byte offset)
	     a2  = word loop counter, t10 = end-of-count byte bit
	   and control leaves through t9.  */

	/* Assemble the first output word and test the first source word
	   for a null, ignoring the bytes that precede the string.  */
	lda	t2, -1		# E : t2 = all ones
	mskqh	t2, a1, t2	# U : ones only at/above the start offset (stall)
	mskqh	t1, a1, t3	# U : t3 = source bytes at/above the offset
	ornot	t1, t2, t2	# E : bytes below the offset forced non-zero (stall)

	mskql	t0, a1, t0	# U : keep destination bytes below the offset
	cmpbge	zero, t2, t7	# E : t7 = one bit per null source byte
	or	t0, t3, t0	# E : first output word (stall)
	beq	a2, $a_count_end # U : COUNT runs out inside this word

	bne	t7, $a_tail	# U : null seen in the first word
	nop
	nop
	nop

	/* Word-at-a-time loop.  On entry t0 is a word to store that holds
	   no null.  The nops keep the store quad apart from the load quad
	   and leave at most one conditional branch per fetch quad.  */
$a_next:
	stq_u	t0, 0(a0)	# L : write the current word
	addq	a0, 8, a0	# E : advance DST
	subq	a2, 1, a2	# E : one word fewer to go
	nop

	ldq_u	t0, 0(a1)	# L : fetch the next source word
	addq	a1, 8, a1	# E : advance SRC
	cmpbge	zero, t0, t7	# E : null bytes in the new word
	beq	a2, $a_count_end # U : this is the last word COUNT allows

	beq	t7, $a_next	# U : no null yet, keep copying
	nop
	nop
	nop

	/* Final (possibly partial) word.
	     t0 = source word holding the terminating byte
	     t7 = cmpbge mask; the end-of-count bit has already been merged
		  in when COUNT is what stopped us.  */
$a_tail:
	negq	t7, t8		# E : two's complement of the mask ...
	and	t7, t8, t8	# E : ... isolates the lowest set bit (stall)
	/* When the last byte is byte 7 the whole word is stored, so the
	   destination word need not be read at all.  */
	and	t8, 0x80, t6	# E : is the last byte the top byte? (stall)
	bne	t6, 1f		# U : yes: plain full-word store (stall)

	/* Partial store: merge source bytes up to and including the last
	   byte with the destination bytes that follow it.  */
	ldq_u	t1, 0(a0)	# L : current destination word
	subq	t8, 1, t6	# E : bits below the last-byte bit
	or	t8, t6, t7	# E : t7 = bytes <= last byte (stall)
	zapnot	t0, t7, t0	# U : drop source bytes past the last byte (stall)

	zap	t1, t7, t1	# U : drop destination bytes <= last byte
	or	t0, t1, t0	# E : merged word (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L : write the final word
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* COUNT expired: make the end-of-count byte look like a hit.  */
$a_count_end:
	or	t10, t7, t7	# E : merge end-of-count bit into the mask
	br	$a_tail		# L0 : Latency=3
	nop
	nop

	.end stxncpy_aligned
|
||||
|
||||
.align 4
	.ent __stxncpy
	.globl __stxncpy
__stxncpy:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Entry: a0 = DST, a1 = SRC, a2 = COUNT (non-zero), t9 = return
	   address.  Dispatches to the co-aligned copy (stxncpy_aligned) or
	   to the unaligned copy below.

	   COUNT is biased by the destination misalignment so that it can be
	   split into a word counter (a2) and an end-of-count byte bit (t10).
	   A COUNT with the top bit set (for example SIZE_MAX passed through
	   strncat) would wrap around in that addition and silently become a
	   tiny count, so it is first clamped to LONG_MAX; no object can be
	   that large, hence the clamp cannot cut a valid copy short.  */

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t1	# E :
	and	a0, 7, t0	# E : find dest misalignment
	and	t1, 7, t1	# E : non-zero iff SRC and DST offsets differ
	lda	t2, -1		# E : t2 = all ones

	srl	t2, 1, t2	# U : t2 = LONG_MAX
	cmovlt	a2, t2, a2	# E : clamp huge COUNT; Latency=2, extra map slot
	nop			# E : keep with cmovlt
	addq	a2, t0, a2	# E : bias count by dest misalignment

	subq	a2, 1, a2	# E :
	and	a2, 7, t2	# E : byte index of the last counted byte (stall)
	srl	a2, 3, a2	# U : a2 = loop counter = (count - 1)/8 (stall)
	addq	zero, 1, t10	# E :

	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
	bne	t1, $unaligned	# U :
	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	addq	a1, 8, a1	# E :

	beq	t0, stxncpy_aligned	# U : avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# L :
	nop
	nop

	br	stxncpy_aligned	# L0 :
	nop
	nop
	nop


	/* The source and destination are not co-aligned.  Align the destination
	   and cope.  We have to be very careful about not reading too much and
	   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)

	or	t1, t4, t1	# E : first aligned src word complete (stall)
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
	or	t0, t1, t0	# E : first output word complete (stall)
	or	t0, t6, t6	# E : mask original data for zero test (stall)

	cmpbge	zero, t6, t7	# E :
	beq	a2, $u_eocfin	# U : COUNT ends inside the first word
	nop
	nop

	bne	t7, $u_final	# U : null inside the first word
	lda	t6, -1		# E : mask out the bits we have
	mskql	t6, a1, t6	# U :   already seen (stall)
	stq_u	t0, 0(a0)	# L : store first output word

	or	t6, t2, t2	# E :
	cmpbge	zero, t2, t7	# E : find nulls in second partial (stall)
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :

	bne	t7, $u_late_head_exit	# U :
	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	ldq_u	t2, 8(a1)	# L : read next high-order source word
	addq	a1, 8, a1	# E :

	cmpbge	zero, t2, t7	# E : (stall)
	beq	a2, $u_eoc	# U :
	nop
	nop

	bne	t7, $u_eos	# U :
	nop
	nop
	nop

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	extqh	t2, a1, t0	# U : extract high bits for current word
	addq	a1, 8, a1	# E :
	extql	t2, a1, t3	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	or	t0, t1, t0	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t0, -8(a0)	# L : save the current word (stall)
	mov	t3, t1		# E :

	subq	a2, 1, a2	# E :
	cmpbge	zero, t2, t7	# E : test new word for eos (2 cycle stall for data)
	beq	a2, $u_eoc	# U : (stall)
	nop

	beq	t7, $u_loop	# U :
	nop
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E : first (partial) source word complete (stall)
	cmpbge	zero, t0, t7	# E : is the null in this first bit? (stall)
	bne	t7, $u_final	# U : (stall)

	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t7	# E :
	or	t7, t10, t6	# E : null mask plus end-of-count bit (stall)
	cmoveq	a2, t6, t7	# E : use it when the word counter is zero;
				#     Latency=2, extra map slot (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t7 == cmpbge mask that found the null.  */
$u_final:
	negq	t7, t6		# E : isolate low bit set
	and	t6, t7, t8	# E : (stall)
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t1, 0(a0)	# L :
	subq	t8, 1, t6	# E :
	or	t6, t8, t7	# E : t7 = bytes <= last byte (stall)
	zapnot	t0, t7, t0	# U : kill source bytes > null

	zap	t1, t7, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3

$u_eoc:				# end-of-count
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E : (stall)
	cmpbge	zero, t0, t7	# E : (stall)
	nop

$u_eocfin:			# end-of-count, final word
	or	t10, t7, t7	# E : treat the end-of-count byte as a hit
	br	$u_final	# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U : DST aligned: nothing to preserve
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

	cmplt	t4, t5, t8	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t8, $u_head	# U : (stall)

	mskqh	t2, t5, t2	# U : begin src byte validity mask
	cmpbge	zero, t1, t7	# E : is there a zero?
	extql	t2, a1, t2	# U :
	or	t7, t10, t5	# E : test for end-of-count too

	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t7	# E : Latency=2, extra map slot
	nop			# E : keep with cmoveq
	andnot	t7, t3, t7	# E : ignore bytes that are not source data (stall)

	beq	t7, $u_head	# U :
	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t7, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :

	and	t6, t7, t8	# E : t8 = bit of the last byte to write
	subq	t8, 1, t6	# E : (stall)
	or	t6, t8, t7	# E : (stall)
	zapnot	t2, t7, t2	# U : prepare source word; mirror changes (stall)

	zapnot	t1, t7, t1	# U : to source validity mask
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
	stq_u	t0, 0(a0)	# L : (stall)

	ret	(t9)		# L0 : Latency=3
	nop
	nop
	nop

	.end __stxncpy
|
||||
|
1
sysdeps/alpha/alphaev67/Implies
Normal file
1
sysdeps/alpha/alphaev67/Implies
Normal file
@ -0,0 +1 @@
|
||||
alpha/alphaev6
|
50
sysdeps/alpha/alphaev67/ffs.S
Normal file
50
sysdeps/alpha/alphaev67/ffs.S
Normal file
@ -0,0 +1,50 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Finds the first bit set in an integer. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
|
||||
ENTRY(__ffs)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/* a0 = 32-bit argument; result in v0 is the 1-based index of the
	   lowest set bit, or 0 when no bit is set.  */
	zap	a0, 0xF0, a0	# clear bytes 4-7: only the low 32 bits count
	cttz	a0, v0		# zero-based index of the lowest set bit
	addq	v0, 1, v0	# convert to a 1-based index
	cmoveq	a0, 0, v0	# no bit set at all: answer is 0

	nop
	nop
	nop
	ret

END(__ffs)

weak_alias (__ffs, ffs)
|
45
sysdeps/alpha/alphaev67/ffsll.S
Normal file
45
sysdeps/alpha/alphaev67/ffsll.S
Normal file
@ -0,0 +1,45 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Finds the first bit set in a long. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
ENTRY(ffsl)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/* a0 = 64-bit argument; v0 = 1-based index of its lowest set bit,
	   or 0 when the argument is zero.  */
	cttz	a0, v0		# zero-based index of the lowest set bit
	addq	v0, 1, v0	# make it 1-based
	cmoveq	a0, 0, v0	# zero input yields 0
	ret

END(ffsl)

weak_extern (ffsl)
weak_alias (ffsl, ffsll)
|
93
sysdeps/alpha/alphaev67/rawmemchr.S
Normal file
93
sysdeps/alpha/alphaev67/rawmemchr.S
Normal file
@ -0,0 +1,93 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Return pointer to first occurrence of CH in STR. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
ENTRY(__rawmemchr)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/* a0 = start address, a1 = byte to find; v0 = address of the first
	   matching byte.  There is no length and no terminator test: the
	   scan only stops on a match.  While the first quadword loads, the
	   byte is replicated into all eight lanes of a1.  */
	ldq_u	t0, 0(a0)	# L : quadword containing the start, Latency=3
	and	a1, 0xff, t3	# E : 00000000000000ch
	insbl	a1, 1, t5	# U : 000000000000ch00
	insbl	a1, 7, a2	# U : ch00000000000000

	insbl	t3, 6, a3	# U : 00ch000000000000
	or	t5, t3, a1	# E : 000000000000chch
	andnot	a0, 7, v0	# E : v0 = start rounded down to 8 bytes
	lda	t4, -1		# E : all ones, seed for the garbage mask

	mskqh	t4, a0, t4	# U : zero the lanes that precede the start
	or	a2, a3, a2	# E : chch000000000000
	inswl	a1, 2, t5	# E : 00000000chch0000
	inswl	a1, 4, a3	# E : 0000chch00000000

	or	a1, a2, a1	# E : chch00000000chch
	or	a3, t5, t5	# E : 0000chchchch0000
	cmpbge	zero, t4, t4	# E : t4 = one bit per lane before the start
	nop			# E :

	/* Every instruction in this quad depends on the previous one.  */
	or	t5, a1, a1	# E : chchchchchchchch
	xor	t0, a1, t1	# E : matching lanes become zero
	cmpbge	zero, t1, t0	# E : t0 = one bit per matching lane
	andnot	t0, t4, t0	# E : discard lanes before the start

	cttz	t0, a2		# U0 : lane of the first match, used only if any
	nop			# E :
	nop			# E :
	bne	t0, $hit	# U : match in the first quadword

	/* Aligned scan, one quadword per iteration.  Each pass waits on
	   the load; unrolling would read further ahead, which the original
	   author judged generally unsafe.  */
$scan:
	ldq	t0, 8(v0)	# L : next aligned quadword, Latency=3
	addq	v0, 8, v0	# E : v0 now addresses that quadword
	xor	t0, a1, t1	# E : matching lanes become zero
	cmpbge	zero, t1, t0	# E : t0 = one bit per matching lane

	cttz	t0, a2		# U0 : lane of the first match, used only if any
	nop			# E :
	nop			# E :
	beq	t0, $scan	# U : nothing here, keep going

$hit:
	/* t0/t1 are not read again after these two instructions; they are
	   kept so the instruction stream matches the original.  */
	negq	t0, t1		# E : isolate the lowest set bit of the mask
	and	t0, t1, t0	# E :
	addq	v0, a2, v0	# E : quadword base + lane = match address
	ret			# L0 :

END(__rawmemchr)

weak_alias (__rawmemchr, rawmemchr)
|
52
sysdeps/alpha/alphaev67/stpcpy.S
Normal file
52
sysdeps/alpha/alphaev67/stpcpy.S
Normal file
@ -0,0 +1,52 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
Contributed by Richard Henderson <rth@redhat.com>.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Copy SRC to DEST returning the address of the terminating 0 in DEST. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
.text
|
||||
|
||||
ENTRY(__stpcpy)
	ldgp	gp, 0(pv)
#ifdef PROF
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
#endif
	.prologue 1

	/* a0 = DEST, a1 = SRC.  The copy itself is done by __stxcpy, which
	   is called with its return address in t9.  */
	.align 4
	mov	a0, v0		# v0 = DEST; recomputed after the copy
	nop
	jsr	t9, __stxcpy	# copy the string

	/* Values used from __stxcpy's results:
	     t8 = one-bit mask marking the last byte written
	     a0 = unaligned address of the last word written.  */

	cttz	t8, t8		# byte index of the last byte within its word
	andnot	a0, 7, a0	# start of the last word written
	addq	a0, t8, v0	# v0 = address of the last byte written
	ret

END(__stpcpy)

weak_alias (__stpcpy, stpcpy)
|
116
sysdeps/alpha/alphaev67/stpncpy.S
Normal file
116
sysdeps/alpha/alphaev67/stpncpy.S
Normal file
@ -0,0 +1,116 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson (rth@redhat.com)
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Copy no more than N bytes from SRC to DEST, returning the address of
|
||||
the terminating '\0' in DEST. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noat
|
||||
.set noreorder
|
||||
.text
|
||||
|
||||
ENTRY(__stpncpy)
	ldgp	gp, 0(pv)
#ifdef PROF
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
#endif
	.prologue 1

	/* a0 = DEST, a1 = SRC, a2 = N.  A zero N returns DEST untouched.  */
	mov	a0, v0
	beq	a2, $return

	.align 4
	nop
	nop
	jsr	t9, __stxncpy	# do the work of the copy

	/* Values used from __stxncpy's results:
	     t0  = last word written      t8  = bit of the last byte written
	     a0  = address of that word   t10 = bit of the last byte N allows
	     a2  = full words still left in N.  */
	cttz	t8, t4		# t4 = byte index of the last byte written
	zapnot	t0, t8, t5	# t5 = that byte alone, still in its lane
	andnot	a0, 7, a0	# a0 = aligned address of the last word
	bne	a2, $pad_words	# do we have full words left?

	/* N ends inside the word just written: clear the bytes after the
	   last byte written, up to and including the end-of-count byte.  */
	subq	t8, 1, t2
	subq	t10, 1, t3
	cmpult	zero, t5, t5	# t5 = 1 when the last byte written is non-null
	addq	a0, t4, v0	# v0 = address of the last byte written

	or	t2, t8, t2	# t2 = bytes <= last byte written
	or	t3, t10, t3	# t3 = bytes <= end of count
	addq	v0, t5, v0	# step past it when it was not the null
	andnot	t3, t2, t3	# t3 = bytes after the copy, within the count

	zap	t0, t3, t0	# zero exactly those bytes
	nop
	stq	t0, 0(a0)
	ret

$pad_words:
	subq	t8, 1, t7	# clear the final bits in the prev word
	cmpult	zero, t5, t5	# t5 = 1 when the last byte written is non-null
	or	t7, t8, t7	# t7 = bytes <= last byte written
	zapnot	t0, t7, t0	# zero everything after it in this word

	subq	a2, 1, a2
	stq	t0, 0(a0)
	addq	a0, 8, a1	# a1 walks the words that remain
	beq	a2, 1f		# loop over full words remaining

	nop
	nop
	nop
	blbc	a2, 0f		# even number left: go straight to the pair loop

	stq	zero, 0(a1)	# odd number left: zero a single word first
	subq	a2, 1, a2
	addq	a1, 8, a1
	beq	a2, 1f

0:	stq	zero, 0(a1)	# zero two words per iteration
	subq	a2, 2, a2
	nop
	nop

	stq	zero, 8(a1)
	addq	a1, 16, a1
	nop
	bne	a2, 0b

1:	ldq	t0, 0(a1)	# clear the leading bits in the final word
	subq	t10, 1, t7
	addq	a0, t4, v0	# v0 = address of the last byte written
	nop

	or	t7, t10, t7	# t7 = bytes <= end of count
	addq	v0, t5, v0	# step past it when it was not the null
	zap	t0, t7, t0
	stq	t0, 0(a1)

$return:
	nop
	nop
	nop
	ret

END(__stpncpy)

weak_alias (__stpncpy, stpncpy)
|
62
sysdeps/alpha/alphaev67/strcat.S
Normal file
62
sysdeps/alpha/alphaev67/strcat.S
Normal file
@ -0,0 +1,62 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
||||
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Append a null-terminated string from SRC to DST. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.text
|
||||
|
||||
ENTRY(strcat)
	ldgp	gp, 0(pv)
#ifdef PROF
	.set noat
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.set at
#endif
	.prologue 1

	/* a0 = DST, a1 = SRC (passed through untouched to __stxcpy).  */
	mov	a0, v0		# E : return value is the original DST
	/* Locate the terminating null of DST.  */
	ldq_u	t0, 0(a0)	# L : quadword holding the first byte of DST
	lda	t1, -1		# E : all ones
	insqh	t1, a0, t1	# U : ones only in the bytes that precede DST

	andnot	a0, 7, a0	# E : round the scan pointer down to 8 bytes
	or	t1, t0, t0	# E : make the preceding bytes non-zero
	cmpbge	zero, t0, t1	# E : bits set iff byte == 0
	bne	t1, $at_end	# U : null already in the first quadword

$scan:	ldq	t0, 8(a0)	# L : next aligned quadword
	addq	a0, 8, a0	# E : a0 now addresses it
	cmpbge	zero, t0, t1	# E : bits set iff byte == 0
	beq	t1, $scan	# U : no null yet

$at_end: cttz	t1, t2		# U0 : byte index of the null in its quadword
	addq	a0, t2, a0	# E : a0 = address of DST's terminator
	/* Append: tail-jump into __stxcpy with our own return address in
	   t9, the register __stxcpy is entered with in __stpcpy as well.  */
	mov	ra, t9		# E :
	jmp	zero, __stxcpy	# L0 :

END(strcat)
|
101
sysdeps/alpha/alphaev67/strchr.S
Normal file
101
sysdeps/alpha/alphaev67/strchr.S
Normal file
@ -0,0 +1,101 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
||||
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Return the address of a given character within a null-terminated
|
||||
string, or null if it is not found. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
ENTRY(strchr)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/* a0 = string, a1 = character; v0 = address of its first occurrence,
	   or 0 when the terminating null comes first.  While the first
	   quadword loads, the character is replicated into all eight lanes
	   of a1.  */
	ldq_u	t0, 0(a0)	# L : quadword containing the start, Latency=3
	and	a1, 0xff, t3	# E : 00000000000000ch
	insbl	a1, 1, t5	# U : 000000000000ch00
	insbl	a1, 7, a2	# U : ch00000000000000

	insbl	t3, 6, a3	# U : 00ch000000000000
	or	t5, t3, a1	# E : 000000000000chch
	andnot	a0, 7, v0	# E : v0 = start rounded down to 8 bytes
	lda	t4, -1		# E : all ones, seed for the garbage mask

	mskqh	t4, a0, t4	# U : zero the lanes that precede the start
	or	a2, a3, a2	# E : chch000000000000
	inswl	a1, 2, t5	# E : 00000000chch0000
	inswl	a1, 4, a3	# E : 0000chch00000000

	or	a1, a2, a1	# E : chch00000000chch
	or	a3, t5, t5	# E : 0000chchchch0000
	cmpbge	zero, t0, t2	# E : t2 = one bit per null lane
	cmpbge	zero, t4, t4	# E : t4 = one bit per lane before the start

	/* Every instruction in this quad depends on the previous one.  */
	or	t5, a1, a1	# E : chchchchchchchch
	xor	t0, a1, t1	# E : lanes equal to the character become zero
	cmpbge	zero, t1, t3	# E : t3 = one bit per matching lane
	or	t2, t3, t0	# E : t0 = lanes that stop the scan

	andnot	t0, t4, t0	# E : discard lanes before the start
	cttz	t0, a2		# U0 : lane of the first stop, used only if any
	nop			# E :
	bne	t0, $stop	# U : null or match in the first quadword

	/* Aligned scan, one quadword per iteration.  Each pass waits on
	   the load; unrolling would read further ahead, which the original
	   author judged generally unsafe.  */
$scan:
	ldq	t0, 8(v0)	# L : next aligned quadword, Latency=3
	addq	v0, 8, v0	# E : v0 now addresses that quadword
	xor	t0, a1, t1	# E : lanes equal to the character become zero
	cmpbge	zero, t0, t2	# E : t2 = one bit per null lane

	cmpbge	zero, t1, t3	# E : t3 = one bit per matching lane
	or	t2, t3, t0	# E : t0 = lanes that stop the scan
	cttz	t3, a2		# U0 : lane of the first match, used only if
				#      the match is what stopped the scan
	beq	t0, $scan	# U : nothing here, keep going

$stop:
	negq	t0, t1		# E : isolate the lowest stop bit ...
	and	t0, t1, t0	# E : ... the first lane that ended the scan
	and	t0, t3, t1	# E : non-zero iff that lane holds the character
	addq	v0, a2, v0	# E : quadword base + lane

	cmoveq	t1, zero, v0	# E : the null came first: return 0
				#     (two mapping slots, latency = 2)
	nop
	nop
	ret			# L0 :

END(strchr)

weak_alias (strchr, index)
|
61
sysdeps/alpha/alphaev67/strlen.S
Normal file
61
sysdeps/alpha/alphaev67/strlen.S
Normal file
@ -0,0 +1,61 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by David Mosberger (davidm@cs.arizona.edu).
|
||||
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Finds length of a 0-terminated string. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
ENTRY(strlen)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/* a0 = string; v0 = number of bytes before its terminating null.  */
	ldq_u	t0, 0(a0)	# L : quadword holding the first byte (a0 may be misaligned)
	lda	t1, -1(zero)	# E : all ones
	insqh	t1, a0, t1	# U : ones only in the bytes that precede the string
	andnot	a0, 7, v0	# E : v0 = scan pointer, rounded down to 8 bytes

	or	t1, t0, t0	# E : make the preceding bytes non-zero
	cmpbge	zero, t0, t1	# E : t1 bit i set <==> byte i is zero
	nop			# E :
	bne	t1, $done	# U : null already in the first quadword

$scan:	ldq	t0, 8(v0)	# L : next aligned quadword
	addq	v0, 8, v0	# E : v0 now addresses it
	cmpbge	zero, t0, t1	# E : t1 bit i set <==> byte i is zero
	beq	t1, $scan	# U : no null yet

$done:
	cttz	t1, t2		# U0 : byte index of the null in its quadword
	addq	v0, t2, v0	# E : v0 = address of the null
	subq	v0, a0, v0	# E : minus the start address = length
	ret	zero, (ra)	# L0 :

END(strlen)
|
101
sysdeps/alpha/alphaev67/strncat.S
Normal file
101
sysdeps/alpha/alphaev67/strncat.S
Normal file
@ -0,0 +1,101 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
Contributed by Richard Henderson <rth@tamu.edu>, 1996.
|
||||
EV67 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Append no more than COUNT characters from the null-terminated string SRC
|
||||
to the null-terminated string DST. Always null-terminate the new DST. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.arch ev6
|
||||
.set noreorder
|
||||
.text
|
||||
|
||||
/* strncat: $16 = DST, $18 = COUNT on entry; the original DST pointer is
   returned in $0.  The code first locates the terminator of DST one
   aligned quadword at a time, then calls __stxncpy (return address in
   $23) to do the copy, and finally makes sure a terminating zero byte
   follows what was copied.
   NOTE(review): the tail reads $1, $16, $18, $24 and $27 as results of
   __stxncpy; that routine is not visible here, so the meanings given
   below are taken from the existing comments and should be confirmed
   against its linkage contract.  */
ENTRY(strncat)
|
||||
ldgp gp, 0(pv)
|
||||
#ifdef PROF
|
||||
.set noat
|
||||
lda AT, _mcount
|
||||
jsr AT, (AT), _mcount # profiling hook, linked through AT
|
||||
.set at
|
||||
#endif
|
||||
.prologue 1
|
||||
|
||||
mov $16, $0 # set up return value
|
||||
beq $18, $zerocount # U : COUNT == 0, nothing to append
|
||||
/* Find the end of the string. */
|
||||
ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
|
||||
lda $2, -1($31) # E : $2 = all ones
|
||||
|
||||
insqh $2, $0, $2 # U : $2 = 0xff in each byte that precedes DST ($0 == original $16)
|
||||
andnot $16, 7, $16 # E : round the scan pointer down to a quadword boundary
|
||||
nop # E :
|
||||
or $2, $1, $1 # E : make the bytes before DST non-zero
|
||||
|
||||
nop # E :
|
||||
nop # E :
|
||||
cmpbge $31, $1, $2 # E : bits set iff byte == 0
|
||||
bne $2, $found # U : terminator is inside the first quadword
|
||||
|
||||
$loop: ldq $1, 8($16) # L : fetch the next aligned quadword
|
||||
addq $16, 8, $16 # E : advance the scan pointer
|
||||
cmpbge $31, $1, $2 # E : flag the zero bytes of this quadword
|
||||
beq $2, $loop # U : none yet, keep scanning
|
||||
|
||||
$found: cttz $2, $3 # U0 : $3 = index of the first zero byte
|
||||
addq $16, $3, $16 # E : $16 = address of the terminator of DST
|
||||
jsr $23, __stxncpy # L0 : now do the append (return address in $23)
|
||||
|
||||
/* Worry about the null termination. */
|
||||
|
||||
zapnot $1, $27, $2 # U : was last byte a null?
|
||||
cmplt $27, $24, $5 # E : did we fill the buffer completely?
|
||||
bne $2, 0f # U : last byte written was not zero, terminate by hand
|
||||
ret # L0 :
|
||||
|
||||
0: or $5, $18, $2 # E : non-zero if $27 < $24 or $18 is non-zero
|
||||
nop
|
||||
bne $2, 2f # U : then clear the byte selected by $24 as it stands
|
||||
and $24, 0x80, $3 # E : no zero next byte
|
||||
|
||||
nop # E :
|
||||
bne $3, 1f # U : $24 selects byte 7, so the next byte is in the next word
|
||||
/* Here there are bytes left in the current word. Clear one. */
|
||||
addq $24, $24, $24 # E : end-of-count bit <<= 1
|
||||
nop # E :
|
||||
|
||||
2: zap $1, $24, $1 # U : zero the byte(s) of $1 selected by mask $24
|
||||
nop # E :
|
||||
stq_u $1, 0($16) # L : write the patched word back
|
||||
ret # L0 :
|
||||
|
||||
1: /* Here we must clear the first byte of the next DST word */
|
||||
/* NOTE(review): stb uses the full byte address, so 8($16) is the first
   byte of the next word only if $16 is quadword-aligned at this point;
   confirm what __stxncpy leaves in $16.  */
stb $31, 8($16) # L : store a zero byte at $16 + 8
|
||||
nop # E :
|
||||
nop # E :
|
||||
ret # L0 :
|
||||
|
||||
$zerocount:
|
||||
nop # E :
|
||||
nop # E :
|
||||
nop # E :
|
||||
ret # L0 : DST is left untouched, $0 already holds it
|
||||
|
||||
END(strncat)
|
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 1996, 1997, 1998, 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@ -30,15 +30,13 @@ ENTRY(htonl)
|
||||
.prologue 0
|
||||
#endif
|
||||
|
||||
extlh a0, 5, t1 # t1 = dd000000
|
||||
zap a0, 0xfd, t2 # t2 = 0000cc00
|
||||
sll t2, 5, t2 # t2 = 00198000
|
||||
s8addl t2, t1, t1 # t1 = ddcc0000
|
||||
zap a0, 0xfb, t2 # t2 = 00bb0000
|
||||
srl t2, 8, t2 # t2 = 0000bb00
|
||||
extbl a0, 3, v0 # v0 = 000000aa
|
||||
or t1, v0, v0 # v0 = ddcc00aa
|
||||
or t2, v0, v0 # v0 = ddccbbaa
|
||||
/* Reverse the four low bytes of a0 (written AABBCCDD in the comments
   below) and leave the sign-extended 32-bit result in v0.  The shift
   that produces t2 must be a right shift: it brings CC and AA down to
   bytes 2 and 0, which are the bytes kept by the zapnot with mask 0x05.
   A left shift would leave byte 0 of t2 zero and lose AA.  */
inslh a0, 7, t0 # t0 = 0000000000AABBCC
|
||||
inswl a0, 3, t1 # t1 = 000000CCDD000000
|
||||
or t1, t0, t1 # t1 = 000000CCDDAABBCC
|
||||
srl t1, 16, t2 # t2 = 0000000000CCDDAA
|
||||
zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00 (keep bytes 1 and 3)
|
||||
zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA (keep bytes 0 and 2)
|
||||
addl t0, t3, v0 # v0 = ssssssssDDCCBBAA
|
||||
ret
|
||||
|
||||
END(htonl)
|
||||
|
89
sysdeps/alpha/rawmemchr.S
Normal file
89
sysdeps/alpha/rawmemchr.S
Normal file
@ -0,0 +1,89 @@
|
||||
/* Copyright (C) 2000 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* Return pointer to first occurrence of CH in STR. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.set noreorder
|
||||
.set noat
|
||||
|
||||
/* __rawmemchr: a0 = start address, a1 = byte value to look for; the
   address of the first matching byte is left in v0.  There is no length
   argument: the loop below only ends when a match is seen.  Memory is
   examined one aligned quadword at a time.  */
ENTRY(__rawmemchr)
|
||||
#ifdef PROF
|
||||
ldgp gp, 0(pv) # gp is only set up in the profiling build
|
||||
lda AT, _mcount
|
||||
jsr AT, (AT), _mcount # profiling hook, linked through AT
|
||||
.prologue 1
|
||||
#else
|
||||
.prologue 0
|
||||
#endif
|
||||
|
||||
zapnot a1, 1, a1 # e0 : zero extend the search character
|
||||
ldq_u t0, 0(a0) # .. e1 : load first quadword
|
||||
sll a1, 8, t5 # e0 : replicate the search character
|
||||
andnot a0, 7, v0 # .. e1 : align our loop pointer
|
||||
|
||||
or t5, a1, a1 # e0 : a1 = c in bytes 0-1
|
||||
lda t4, -1 # .. e1 : build garbage mask
|
||||
sll a1, 16, t5 # e0 :
|
||||
unop # :
|
||||
|
||||
mskqh t4, a0, t4 # e0 : zero the bytes of t4 that lie before a0
|
||||
or t5, a1, a1 # .. e1 : a1 = c in bytes 0-3
|
||||
sll a1, 32, t5 # e0 :
|
||||
cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage
|
||||
|
||||
or t5, a1, a1 # e0 : a1 = c in all eight bytes
|
||||
xor t0, a1, t1 # .. e1 : make bytes == c zero
|
||||
cmpbge zero, t1, t3 # e0 : bits set iff byte == c
|
||||
unop # :
|
||||
|
||||
andnot t3, t4, t0 # e0 : clear garbage bits
|
||||
fnop # .. fa :
|
||||
unop # :
|
||||
bne t0, $found # .. e1 (zdb) : match inside the first quadword
|
||||
|
||||
.align 4
|
||||
$loop:
|
||||
ldq t0, 8(v0) # e0 : fetch the next aligned quadword
|
||||
addq v0, 8, v0 # .. e1 : advance the loop pointer
|
||||
nop # e0 :
|
||||
xor t0, a1, t1 # .. e1 (ev5 data stall) : make bytes == c zero
|
||||
|
||||
cmpbge zero, t1, t0 # e0 : bits set iff byte == c
|
||||
beq t0, $loop # .. e1 (zdb) : no match yet, keep scanning
|
||||
|
||||
$found:
|
||||
negq t0, t1 # e0 : clear all but least set bit
|
||||
and t0, t1, t0 # e1 (stall)
|
||||
and t0, 0xf0, t2 # e0 : binary search for that set bit
|
||||
and t0, 0xcc, t3 # .. e1 :
|
||||
|
||||
and t0, 0xaa, t4 # e0 :
|
||||
cmovne t2, 4, t2 # .. e1 : t2 = 4 if the bit is in positions 4-7
|
||||
cmovne t3, 2, t3 # e0 : t3 = 2 if the bit is in positions 2, 3, 6 or 7
|
||||
cmovne t4, 1, t4 # .. e1 : t4 = 1 if the bit is in an odd position
|
||||
|
||||
addq t2, t3, t2 # e0 : t2 + t3 + t4 = bit index = byte offset of the match
|
||||
addq v0, t4, v0 # .. e1 :
|
||||
addq v0, t2, v0 # e0 : v0 = quadword address + byte offset
|
||||
ret # .. e1 :
|
||||
|
||||
END(__rawmemchr)
|
||||
|
||||
weak_alias (__rawmemchr, rawmemchr)
|
@ -65,7 +65,7 @@ $found: negq t1, t2 # clear all but least set bit
|
||||
|
||||
/* Now do the append. */
|
||||
|
||||
jsr t9, __stxcpy
|
||||
ret
|
||||
mov ra, t9
|
||||
jmp $31, __stxcpy
|
||||
|
||||
END(strcat)
|
||||
|
@ -35,7 +35,7 @@ ENTRY(strcpy)
|
||||
.prologue 1
|
||||
|
||||
mov a0, v0 # set up return value
|
||||
jsr t9, __stxcpy # do the copy
|
||||
ret
|
||||
mov ra, t9
|
||||
jmp $31, __stxcpy # do the copy
|
||||
|
||||
END(strcpy)
|
||||
|
@ -53,7 +53,6 @@ ENTRY(strncpy)
|
||||
ret # .. e1 :
|
||||
|
||||
$multiword:
|
||||
|
||||
subq t8, 1, t7 # e0 : clear the final bits in the prev
|
||||
or t7, t8, t7 # e1 : word
|
||||
zapnot t0, t7, t0 # e0 :
|
||||
|
@ -82,3 +82,11 @@ _start:
|
||||
;;
|
||||
}
|
||||
.endp _start#
|
||||
|
||||
/* Define a symbol for the first piece of initialized data. */
|
||||
.data
|
||||
.globl __data_start /* export the label below */
|
||||
__data_start:
|
||||
.long 0 /* one zero-initialized 32-bit word placed at the label */
|
||||
.weak data_start /* data_start is a weak symbol ... */
|
||||
data_start = __data_start /* ... given the same value as __data_start */
|
||||
|
Loading…
Reference in New Issue
Block a user