* sysdeps/ia64/memccpy.S: New file.
	* sysdeps/ia64/memchr.S: New file.
	* sysdeps/ia64/memcmp.S: New file.
	* sysdeps/ia64/memcpy.S: New file.
	* sysdeps/ia64/memmove.S: New file.
	* sysdeps/ia64/memset.S: New file.
	* sysdeps/ia64/strcat.S: New file.
	* sysdeps/ia64/strchr.S: New file.
	* sysdeps/ia64/strcmp.S: New file.
	* sysdeps/ia64/strcpy.S: New file.
	* sysdeps/ia64/strlen.S: New file.
	* sysdeps/ia64/strncmp.S: New file.
	* sysdeps/ia64/strncpy.S: New file.
	* sysdeps/ia64/softpipe.h: New file.
	Patches by Dan Pop <Dan.Pop@cern.ch>.

	* manual/memory.texi: Document memory handling functions (mlock,
	munlock, mlockall, munlockall, brk, and sbrk)
This commit is contained in:
Ulrich Drepper 2000-05-21 22:04:15 +00:00
parent 99a206167b
commit 5bfc425236
15 changed files with 1718 additions and 1 deletions

View File

@ -1,6 +1,23 @@
2000-05-21 Ulrich Drepper <drepper@redhat.com> 2000-05-21 Ulrich Drepper <drepper@redhat.com>
* manual/memory.texi: Document memory handling functions. * sysdeps/ia64/memccpy.S: New file.
* sysdeps/ia64/memchr.S: New file.
* sysdeps/ia64/memcmp.S: New file.
* sysdeps/ia64/memcpy.S: New file.
* sysdeps/ia64/memmove.S: New file.
* sysdeps/ia64/memset.S: New file.
* sysdeps/ia64/strcat.S: New file.
* sysdeps/ia64/strchr.S: New file.
* sysdeps/ia64/strcmp.S: New file.
* sysdeps/ia64/strcpy.S: New file.
* sysdeps/ia64/strlen.S: New file.
* sysdeps/ia64/strncmp.S: New file.
* sysdeps/ia64/strncpy.S: New file.
* sysdeps/ia64/softpipe.h: New file.
Patches by Dan Pop <Dan.Pop@cern.ch>.
* manual/memory.texi: Document memory handling functions (mlock,
munlock, mlockall, munlockall, brk, and sbrk)
* manual/time.texi: Document timespec and friends. * manual/time.texi: Document timespec and friends.
* manual/conf.texi: Fix references. * manual/conf.texi: Fix references.
* manual/ctype.texi: Likewise. * manual/ctype.texi: Likewise.

164
sysdeps/ia64/memccpy.S Normal file
View File

@ -0,0 +1,164 @@
/* Optimized version of the memccpy() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: a pointer to the next byte after char in dest or NULL
Inputs:
in0: dest
in1: src
in2: char
in3: byte count
This implementation assumes little endian mode (UM.be = 0).
This implementation assumes that it is safe to do read ahead
in the src block, without getting beyond its limit. */
#include <sysdep.h>
#undef ret
/* Tuning constants and register aliases used by memccpy below.
   OP_T_THRES is the length at or below which the routine copies byte by
   byte; OPSIZ (the word size in bytes) is referenced only in comments.
   saved_pfs/saved_pr/saved_lc hold the caller state that is restored on
   exit; dest/src/len are working copies of the arguments; asrc is src
   rounded down to a multiple of 8; char is the low byte of the third
   argument and charx8 is that byte replicated into all 8 bytes of a
   word; sh1/sh2 are the shift counts used to realign src words;
   loopcnt feeds ar.lc; tmp and value are scratch.  */
#define OP_T_THRES 16
#define OPSIZ 8
#define saved_pfs r14
#define saved_pr r17
#define saved_lc r18
#define dest r19
#define src r20
#define len r21
#define asrc r22
#define tmp r23
#define char r24
#define charx8 r25
#define sh2 r28
#define sh1 r29
#define loopcnt r30
#define value r31
/* memccpy: copy bytes from src to dest, stopping once a byte equal to
   char has been copied or len bytes have been copied.
   In:	in0 = dest, in1 = src, in2 = char (low 8 bits used), in3 = len.
   Out:	ret0 = address in dest just past the copied matching byte, or
	0 (NULL) if no byte matched.
   ar.pfs, pr and ar.lc are saved on entry and restored before every
   return.  MEMLAT is not defined in this file; presumably it comes from
   softpipe.h (confirm there).
   Predicate roles: p6 = match seen in a byte loop or in .l2,
   p7 = match seen in .l3.  .gotit uses them to pick the word to split.  */
ENTRY(memccpy)
alloc saved_pfs = ar.pfs, 4, 40 - 4, 0, 40
#include "softpipe.h"
.rotr r[MEMLAT + 3], tmp1[4], tmp2[4], val[4], tmp3[2], pos0[2]
.rotp p[MEMLAT + 6 + 1]
mov ret0 = r0 // return NULL if no match
mov saved_pr = pr // save the predicate registers
mov saved_lc = ar.lc // save the loop counter
mov dest = in0 // dest
mov src = in1 // src
extr.u char = in2, 0, 8 // char
mov len = in3 // len
sub tmp = r0, in0 // tmp = -dest
cmp.ne p7, p0 = r0, r0 // clear p7
;;
and loopcnt = 7, tmp // loopcnt = -dest % 8
cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
(p6) br.cond.spnt .cpyfew // copy byte by byte
;;
cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
mux1 charx8 = char, @brcst // charx8 = char in every byte
(p6) br.cond.sptk .dest_aligned
sub len = len, loopcnt // len -= -dest % 8
adds loopcnt = -1, loopcnt // --loopcnt
;;
mov ar.lc = loopcnt
// Byte loop that brings dest up to an 8-byte boundary.  A match leaves
// p6 set and jumps to .foundit with dest already past the stored byte.
.l1: // copy -dest % 8 bytes
ld1 value = [src], 1 // value = *src++
;;
st1 [dest] = value, 1 // *dest++ = value
cmp.eq p6, p0 = value, char
(p6) br.cond.spnt .foundit
br.cloop.dptk .l1
// dest is now word aligned.  Split the remaining length into whole
// words (loopcnt) and trailing bytes (len), and work out how far src
// is from word alignment (sh1, in bits after the shift below).
.dest_aligned:
and sh1 = 7, src // sh1 = src % 8
and tmp = -8, len // tmp = len & -OPSIZ
and asrc = -8, src // asrc = src & -OPSIZ -- align src
shr.u loopcnt = len, 3 // loopcnt = len / 8
and len = 7, len ;; // len = len % 8
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
adds loopcnt = -1, loopcnt // --loopcnt
mov pr.rot = 1 << 16 ;; // set rotating predicates
sub sh2 = 64, sh1 // sh2 = 64 - sh1
mov ar.lc = loopcnt // set LC
cmp.eq p6, p0 = sh1, r0 // is the src aligned?
(p6) br.cond.sptk .src_aligned ;;
// src is misaligned: advance src past the words the loop will consume
// (for the trailing byte loop) and preload the first aligned word.
add src = src, tmp // src += len & -OPSIZ
mov ar.ec = MEMLAT + 6 + 1 // six more passes needed
ld8 r[1] = [asrc], 8 // r[1] = w0
cmp.ne p6, p0 = r0, r0 ;; // clear p6
.align 32
// Software-pipelined loop for a misaligned src.  Stages:
//   p[0]         load the next aligned source word
//   p[MEMLAT]    shift the two neighbouring words into place
//   p[MEMLAT+3]  or the halves together into val
//   p[MEMLAT+4]  xor val with charx8 (a matching byte becomes zero)
//   p[MEMLAT+5]  czx1.r gives the index of that zero byte
//   p[MEMLAT+6]  index != 8 means a match: leave via .gotit with p6
//                set, otherwise store the word to dest
.l2:
(p[0]) ld8 r[0] = [asrc], 8 // r[0] = w1
(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 >> sh1
(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 << sh2
(p[MEMLAT+4]) xor tmp3[0] = val[1], charx8
(p[MEMLAT+5]) czx1.r pos0[0] = tmp3[1]
(p[MEMLAT+6]) cmp.ne p6, p0 = 8, pos0[1]
(p6) br.cond.spnt .gotit
(p[MEMLAT+6]) st8 [dest] = val[3], 8 // store val to dest
(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] // val = tmp1 | tmp2
br.ctop.sptk .l2
br.cond.sptk .cpyfew
// Both pointers are word aligned: same search, no shifting required.
// A match sets p7 (not p6) so that .gotit can tell the two loops apart.
.src_aligned:
cmp.ne p6, p0 = r0, r0 // clear p6
mov ar.ec = MEMLAT + 2 + 1 ;; // set EC
.l3:
(p[0]) ld8 r[0] = [src], 8
(p[MEMLAT]) xor tmp3[0] = r[MEMLAT], charx8
(p[MEMLAT+1]) czx1.r pos0[0] = tmp3[1]
(p[MEMLAT+2]) cmp.ne p7, p0 = 8, pos0[1]
(p7) br.cond.spnt .gotit
(p[MEMLAT+2]) st8 [dest] = r[MEMLAT+2], 8
br.ctop.dptk .l3
// Byte loop for short inputs and for the bytes left after a word loop.
.cpyfew:
cmp.eq p6, p0 = len, r0 // is len == 0 ?
adds len = -1, len // --len;
(p6) br.cond.spnt .restore_and_exit ;;
mov ar.lc = len
.l4:
ld1 value = [src], 1
;;
st1 [dest] = value, 1
cmp.eq p6, p0 = value, char
(p6) br.cond.spnt .foundit
br.cloop.dptk .l4 ;;
// Reached by a taken branch from .l1/.l4 (p6 set) or by falling out of
// .l4 without a match (p6 clear, so ret0 keeps its NULL value).
.foundit:
(p6) mov ret0 = dest
.restore_and_exit:
mov ar.pfs = saved_pfs // restore the PFS
mov pr = saved_pr, -1 // restore the predicate registers
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
// A word containing char was found before it was stored.  Copy its
// low bytes one at a time up to and including the match: pos0[1] is the
// byte index of the match, so the loop below runs pos0[1] + 1 times.
.gotit:
(p6) mov value = val[3] // if coming from l2
(p7) mov value = r[MEMLAT+2] // if coming from l3
mov ar.lc = pos0[1] ;;
.l5:
extr.u tmp = value, 0, 8 ;;
st1 [dest] = tmp, 1
shr.u value = value, 8
br.cloop.sptk .l5 ;;
mov ret0 = dest // return the address just past the match
mov ar.pfs = saved_pfs
mov pr = saved_pr, -1
mov ar.lc = saved_lc
br.ret.sptk.many b0
END(memccpy)

124
sysdeps/ia64/memchr.S Normal file
View File

@ -0,0 +1,124 @@
/* Optimized version of the standard memchr() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: the address of the first occurrence of chr in str or NULL
Inputs:
in0: str
in1: chr
in2: byte count
This implementation assumes little endian mode. For big endian mode,
the instruction czx1.r should be replaced by czx1.l.
The algorithm is fairly straightforward: search byte by byte until we
get to a word aligned address, then search word by word as much as
possible; the remaining few bytes are searched one at a time.
The word by word search is performed by xor-ing the word with a word
containing chr in every byte. If there is a hit, the result will
contain a zero byte in the corresponding position. The presence and
position of that zero byte is detected with a czx instruction.
All the loops in this function could have had the internal branch removed
if br.ctop and br.cloop could be predicated :-(. */
#include <sysdep.h>
#undef ret
/* Register aliases used by memchr below.
   saved_pfs/saved_pr/saved_lc hold the caller state restored on exit;
   chr is the byte searched for and chrx8 that byte replicated into all
   8 bytes of a word; len is the number of bytes still to search; val
   holds the byte examined by the byte loops; loopcnt feeds ar.lc; tmp
   is scratch; str names the first argument register.
   pos0 is not referenced by the routine below (the word loop uses the
   rotating register poschr instead).  */
#define saved_pfs r14
#define saved_pr r15
#define saved_lc r16
#define chr r17
#define len r18
#define pos0 r20
#define val r21
#define tmp r24
#define chrx8 r25
#define loopcnt r30
#define str in0
/* memchr: find the first byte equal to chr in the len bytes at str.
   In:	in0 = str, in1 = chr (low 8 bits used), in2 = byte count.
   Out:	ret0 = address of the first matching byte, or 0 (NULL).
   ret0 doubles as the running search pointer.
   Predicate roles at .foundit: p6 = match found by a byte loop (ret0 is
   one past the match), p7 = match found by the word loop .l2.
   ar.pfs, pr and ar.lc are saved on entry and restored on exit.
   MEMLAT is not defined in this file; presumably it comes from
   softpipe.h (confirm there).  */
ENTRY(memchr)
alloc saved_pfs = ar.pfs, 3, 0, 29, 32
#include "softpipe.h"
.rotr value[MEMLAT+1], addr[MEMLAT+3], aux[2], poschr[2]
.rotp p[MEMLAT+3]
mov saved_lc = ar.lc // save the loop counter
mov saved_pr = pr // save the predicates
mov ret0 = str
and tmp = 7, str // tmp = str % 8
cmp.ne p7, p0 = r0, r0 // clear p7
extr.u chr = in1, 0, 8 // chr = (unsigned char) in1
mov len = in2
cmp.gtu p6, p0 = 16, in2 // use a simple loop for short
(p6) br.cond.spnt .srchfew ;; // searches
sub loopcnt = 8, tmp // loopcnt = 8 - tmp
cmp.eq p6, p0 = tmp, r0 // is str already 8-byte aligned?
(p6) br.cond.sptk .str_aligned;;
sub len = len, loopcnt // len -= bytes handled by .l1
adds loopcnt = -1, loopcnt;;
mov ar.lc = loopcnt
// Byte loop that advances ret0 to an 8-byte boundary.
.l1:
ld1 val = [ret0], 1
;;
cmp.eq p6, p0 = val, chr
(p6) br.cond.spnt .foundit
br.cloop.sptk .l1 ;;
.str_aligned:
cmp.ne p6, p0 = r0, r0 // clear p6
shr.u loopcnt = len, 3 // loopcnt = len / 8
and len = 7, len ;; // remaining len = len & 7
adds loopcnt = -1, loopcnt
mov ar.ec = MEMLAT + 3
mux1 chrx8 = chr, @brcst ;; // get a word full of chr
mov ar.lc = loopcnt
mov pr.rot = 1 << 16 ;;
// Software-pipelined word loop.  Stages:
//   p[0]         remember the word address and load the word
//   p[MEMLAT]    xor with chrx8 (a matching byte becomes zero)
//   p[MEMLAT+1]  czx1.r gives the index of that zero byte
//   p[MEMLAT+2]  index != 8 means a match: set p7 and leave
.l2:
(p[0]) mov addr[0] = ret0
(p[0]) ld8 value[0] = [ret0], 8
(p[MEMLAT]) xor aux[0] = value[MEMLAT], chrx8
(p[MEMLAT+1]) czx1.r poschr[0] = aux[1]
(p[MEMLAT+2]) cmp.ne p7, p0 = 8, poschr[1]
(p7) br.cond.dpnt .foundit
br.ctop.dptk .l2
// Byte loop for short searches and for the bytes left after .l2.
.srchfew:
adds loopcnt = -1, len
cmp.eq p6, p0 = len, r0 // nothing left to search?
(p6) br.cond.spnt .notfound ;;
mov ar.lc = loopcnt
.l3:
ld1 val = [ret0], 1
;;
cmp.eq p6, p0 = val, chr
(p6) br.cond.dpnt .foundit
br.cloop.sptk .l3 ;;
.notfound:
cmp.ne p6, p0 = r0, r0 // clear p6 (p7 was already 0 when we got here)
mov ret0 = r0 ;; // return NULL
// With both p6 and p7 clear (the .notfound path) neither adjustment
// below executes and ret0 stays NULL.
.foundit:
(p6) adds ret0 = -1, ret0 // if we got here from l1 or l3
(p7) add ret0 = addr[MEMLAT+2], poschr[1] // if we got here from l2
mov pr = saved_pr, -1
mov ar.pfs = saved_pfs
mov ar.lc = saved_lc
br.ret.sptk.many b0
END(memchr)

163
sysdeps/ia64/memcmp.S Normal file
View File

@ -0,0 +1,163 @@
/* Optimized version of the standard memcmp() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: the result of the comparison
Inputs:
in0: dest (aka s1)
in1: src (aka s2)
in2: byte count
In this form, it assumes little endian mode. For big endian mode,
the two shifts in .l2 must be inverted:
shl tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 << sh1
shr.u tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 >> sh2
and all the mux1 instructions should be replaced by plain mov's. */
#include <sysdep.h>
#undef ret
/* Tuning constants and register aliases used by memcmp below.
   OP_T_THRES is the length at or below which the routine compares byte
   by byte; OPSIZ (the word size in bytes) is referenced only in
   comments.  MEMLAT is defined here directly; this routine does not
   include softpipe.h.
   saved_pfs/saved_pr/saved_lc hold the caller state restored on exit;
   dest/src/len are working copies of the arguments (dest is s1, src is
   s2); asrc is src rounded down to a multiple of 8; value1/value2 hold
   the data from src/dest being compared; sh1/sh2 are the shift counts
   used to realign src words; loopcnt feeds ar.lc; tmp is scratch.
   start is not referenced by the routine below.  */
#define OP_T_THRES 16
#define OPSIZ 8
#define MEMLAT 2
#define saved_pfs r14
#define start r15
#define saved_pr r17
#define saved_lc r18
#define dest r19
#define src r20
#define len r21
#define asrc r22
#define tmp r23
#define value1 r24
#define value2 r25
#define sh2 r28
#define sh1 r29
#define loopcnt r30
/* memcmp: compare the len bytes at dest (s1) with those at src (s2).
   In:	in0 = dest (s1), in1 = src (s2), in2 = byte count.
   Out:	ret0 = 0 if the blocks are equal.  When a byte loop finds the
	difference, ret0 = (s1 byte) - (s2 byte); when a word loop
	finds it, ret0 = -1 if s1 sorts first and 1 otherwise.
   ar.pfs, pr and ar.lc are saved on entry and restored on exit.
   bcmp is provided as a weak alias of this routine.  */
ENTRY(memcmp)
alloc saved_pfs = ar.pfs, 3, 37, 0, 40
.rotr r[MEMLAT + 2], q[MEMLAT + 5], tmp1[4], tmp2[4], val[2]
.rotp p[MEMLAT + 4 + 1]
mov ret0 = r0 // by default return value = 0
mov saved_pr = pr // save the predicate registers
mov saved_lc = ar.lc // save the loop counter
mov dest = in0 // dest
mov src = in1 // src
mov len = in2 // len
sub tmp = r0, in0 // tmp = -dest
;;
and loopcnt = 7, tmp // loopcnt = -dest % 8
cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
(p6) br.cond.spnt .cmpfew // compare byte by byte
;;
cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
(p6) br.cond.sptk .dest_aligned
sub len = len, loopcnt // len -= -dest % 8
adds loopcnt = -1, loopcnt // --loopcnt
;;
mov ar.lc = loopcnt
.l1: // compare -dest % 8 bytes
ld1 value1 = [src], 1 // value1 = *src++
ld1 value2 = [dest], 1 // value2 = *dest++
;;
cmp.ne p6, p0 = value1, value2
(p6) br.cond.spnt .done
br.cloop.dptk .l1
// dest is now word aligned.  Split the remaining length into whole
// words (loopcnt) and trailing bytes (len), and work out how far src
// is from word alignment (sh1, in bits after the shift below).
.dest_aligned:
and sh1 = 7, src // sh1 = src % 8
and tmp = -8, len // tmp = len & -OPSIZ
and asrc = -8, src // asrc = src & -OPSIZ -- align src
shr.u loopcnt = len, 3 // loopcnt = len / 8
and len = 7, len ;; // len = len % 8
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
adds loopcnt = -1, loopcnt // --loopcnt
mov pr.rot = 1 << 16 ;; // set rotating predicates
sub sh2 = 64, sh1 // sh2 = 64 - sh1
mov ar.lc = loopcnt // set LC
cmp.eq p6, p0 = sh1, r0 // is the src aligned?
(p6) br.cond.sptk .src_aligned
// src is misaligned: advance src past the words the loop will consume
// (for the trailing byte loop) and preload the first aligned word.
add src = src, tmp // src += len & -OPSIZ
mov ar.ec = MEMLAT + 4 + 1 // four more passes needed
ld8 r[1] = [asrc], 8 ;; // r[1] = w0
.align 32
// We enter this loop with p6 cleared by the above comparison
// Software-pipelined loop for a misaligned src.  Stages:
//   p[0]         load the next aligned src word and the next dest word
//   p[MEMLAT]    shift the two neighbouring src words into place
//   p[MEMLAT+3]  or the halves together into val
//   p[MEMLAT+4]  compare val with the dest word; p6 set on a mismatch
.l2:
(p[0]) ld8 r[0] = [asrc], 8 // r[0] = w1
(p[0]) ld8 q[0] = [dest], 8
(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 >> sh1
(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 << sh2
(p[MEMLAT+4]) cmp.ne p6, p0 = q[MEMLAT + 4], val[1]
(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] // val = tmp1 | tmp2
(p6) br.cond.spnt .l2exit
br.ctop.sptk .l2
br.cond.sptk .cmpfew
// A word loop found two differing words.  Byte-reverse both with mux1
// so that an unsigned 64-bit compare orders them by their first
// differing byte.  .l3exit fills value1/value2 itself and clears p6 so
// that the predicated moves at .l2exit are skipped.
.l3exit:
mux1 value1 = r[MEMLAT], @rev
mux1 value2 = q[MEMLAT], @rev
cmp.ne p6, p0 = r0, r0 ;; // clear p6
.l2exit:
(p6) mux1 value1 = val[1], @rev
(p6) mux1 value2 = q[MEMLAT + 4], @rev ;;
cmp.ltu p6, p7 = value2, value1 ;; // p6: dest word < src word
(p6) mov ret0 = -1
(p7) mov ret0 = 1
mov ar.pfs = saved_pfs // restore the PFS
mov pr = saved_pr, -1 // restore the predicate registers
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
// Both pointers are word aligned: compare whole words directly.
.src_aligned:
cmp.ne p6, p0 = r0, r0 // clear p6
mov ar.ec = MEMLAT + 1 ;; // set EC
.l3:
(p[0]) ld8 r[0] = [src], 8
(p[0]) ld8 q[0] = [dest], 8
(p[MEMLAT]) cmp.ne p6, p0 = r[MEMLAT], q[MEMLAT]
(p6) br.cond.spnt .l3exit
br.ctop.dptk .l3 ;;
// Byte loop for short inputs and for the bytes left after a word loop.
.cmpfew:
cmp.eq p6, p0 = len, r0 // is len == 0 ?
adds len = -1, len // --len;
(p6) br.cond.spnt .restore_and_exit ;;
mov ar.lc = len
.l4:
ld1 value1 = [src], 1
ld1 value2 = [dest], 1
;;
cmp.ne p6, p0 = value1, value2
(p6) br.cond.spnt .done
br.cloop.dptk .l4 ;;
.done:
(p6) sub ret0 = value2, value1 // don't execute it if falling thru
.restore_and_exit:
mov ar.pfs = saved_pfs // restore the PFS
mov pr = saved_pr, -1 // restore the predicate registers
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
END(memcmp)
weak_alias (memcmp, bcmp)

265
sysdeps/ia64/memcpy.S Normal file
View File

@ -0,0 +1,265 @@
/* Optimized version of the standard memcpy() function.
This file is part of the GNU C Library.
Copyright (C) 1991,92,93,97,98,99 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: dest
Inputs:
in0: dest
in1: src
in2: byte count
An assembly implementation of the algorithm used by the generic C
version from glibc. The case when all three arguments are multiples
of 8 is treated separately, for extra performance.
In this form, it assumes little endian mode. For big endian mode,
sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
shrp instruction. */
#include <sysdep.h>
#undef ret
/* Tuning constants and register aliases used by memcpy below.
   OP_T_THRES is the length at or below which the general path copies
   byte by byte; OPSIZ (the word size in bytes) is referenced only in
   comments.
   saved_pfs/saved_pr/saved_lc hold the caller state restored on exit;
   dest/src/len are working copies of the arguments; sf is the lfetch
   address and rescnt the count of words left after the 64-byte loop
   (both used only on the all-aligned path); asrc is src rounded down to
   a multiple of 8; ptable/ploop56/loopaddr are used to compute the
   address of the LOOP instance to jump to; sh1 is the src misalignment;
   loopcnt feeds ar.lc; value and tmp2..tmp4 are scratch.
   dl0..dh3 are the eight store pointers of the 64-byte loop.  They
   reuse r22-r29, i.e. the same registers as asrc..sh1; the two sets are
   used on different code paths.  */
#define OP_T_THRES 16
#define OPSIZ 8
#define saved_pfs r14
#define sf r15
#define rescnt r16
#define saved_pr r17
#define saved_lc r18
#define dest r19
#define src r20
#define len r21
#define asrc r22
#define tmp2 r23
#define tmp3 r24
#define tmp4 r25
#define ptable r26
#define ploop56 r27
#define loopaddr r28
#define sh1 r29
#define loopcnt r30
#define value r31
#define dl0 r22
#define dh0 r23
#define dl1 r24
#define dh1 r25
#define dl2 r26
#define dh2 r27
#define dl3 r28
#define dh3 r29
/* LOOP(shift) expands to one software-pipelined word-copy loop,
   labelled .loop<shift>, for a source misaligned by shift bits
   (shift = 8 * (src % 8)).  Each pass loads the next aligned source
   word, uses shrp to extract 64 bits spanning two neighbouring words,
   and stores the word built by the previous pass.  When the loop
   finishes, control goes to .cpyfew for the trailing bytes.
   The caller must have set up asrc, dest, ar.lc, ar.ec, the rotating
   predicates and r[1] (the first source word) beforehand.
   Comments inside the macro must use the C form, because a line
   comment would swallow the continuation backslash.  */
#define LOOP(shift) \
.align 32 ; \
.loop##shift##: \
(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
(p[MEMLAT+1]) st8 [dest] = value, 8 ; \
(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
nop.b 0 ; \
nop.b 0 ; \
br.ctop.sptk .loop##shift ; \
br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
/* memcpy: copy len bytes from src to dest.
   In:	in0 = dest, in1 = src, in2 = byte count.
   Out:	ret0 = dest.
   Two main paths:
     - dest, src and len all multiples of 8: a 64-bytes-per-pass loop
       (.l0) using paired FP loads, followed by an unrolled epilog for
       the 0..7 remaining words;
     - otherwise (.next): byte copy until dest is aligned, then a word
       loop (.l3 if src is aligned too, else one of the LOOP instances
       chosen through .table), then a trailing byte loop (.cpyfew).
   ar.pfs, pr and ar.lc are saved on entry and restored on exit.
   MEMLAT is not defined in this file; presumably it comes from
   softpipe.h (confirm there).  */
ENTRY(memcpy)
alloc saved_pfs = ar.pfs, 3, 40-3, 0, 40
#include "softpipe.h"
.rotr r[MEMLAT + 2], q[MEMLAT + 1], s0[2], s1[2], s2[2], s3[2]
.rotf tl0[5], th0[5], tl1[5], th1[5], tl2[5], th2[5], tl3[5], th3[5]
.rotp p[MEMLAT + 2]
mov ret0 = in0 // return value = dest
mov saved_pr = pr // save the predicate registers
brp.loop.many.tk.tk.imp .l0, .done - 16
mov saved_lc = ar.lc // save the loop counter
or tmp3 = in0, in1 ;; // tmp3 = dest | src
or tmp3 = tmp3, in2 // tmp3 = dest | src | len
mov dest = in0 // dest
mov src = in1 // src
mov len = in2 // len
sub tmp2 = r0, in0 // tmp2 = -dest
cmp.eq p6, p0 = in2, r0 // if (len == 0)
(p6) br.cond.spnt .restore_and_exit;;// return dest;
and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
tbit.nz p8, p0 = src, 3 ;; // test for 16-byte boundary align
cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
(p6) br.cond.sptk .next // goto next;
// The optimal case, when dest, src and len are all multiples of 8
// p8 is set when src is 8 bytes short of a 16-byte boundary; in that
// case one word is copied separately before the main loop.
(p8) ld8 value = [src], 8 // align src if necessary
(p8) adds len = -8, len ;; // adjust len accordingly
shr.u loopcnt = len, 6 // loopcnt = len / 64
shr.u rescnt = len, 3 // rescnt = len / 8
mov pr.rot = 1 << 16 // set rotating predicates
mov ar.ec = 4 + 1 ;; // set the epilog counter
cmp.eq p6, p0 = loopcnt, r0
and rescnt = 7, rescnt // rescnt = residual word count
adds loopcnt = -1, loopcnt // --loopcnt
(p8) st8 [dest] = value, 8 // copy one word if aligning
(p6) br.cond.spnt .epilog;; // there are < 8 words to copy
add sf = 64 * 4, src // prefetch address: 256 bytes ahead of src
mov ar.lc = loopcnt // set the loop counter
// Four load pointers, 16 bytes apart (one per ldfp8 pair) ...
mov s0[1] = src
add s1[1] = 16*1, src
add s2[1] = 16*2, src
add s3[1] = 16*3, src
;;
// ... and eight store pointers, 8 bytes apart (one per stf8).
mov dl0 = dest
add dh0 = 8 * 1, dest
add dl1 = 8 * 2, dest
add dh1 = 8 * 3, dest
add dl2 = 8 * 4, dest
add dh2 = 8 * 5, dest
add dl3 = 8 * 6, dest
add dh3 = 8 * 7, dest
;;
// Software-pipelined loop moving 64 bytes per pass.  Stage p[0] issues
// the prefetch and the four paired loads and advances the load
// pointers; stage p[1] keeps src in step for the epilog; stage p[4]
// performs the eight stores.
.l0:
(p[0]) lfetch.nta [sf], 64
(p[0]) ldfp8 tl0[0], th0[0] = [s0[1]]
(p[0]) ldfp8 tl1[0], th1[0] = [s1[1]]
(p[0]) ldfp8 tl2[0], th2[0] = [s2[1]]
(p[0]) ldfp8 tl3[0], th3[0] = [s3[1]]
(p[0]) add s0[0] = 64, s0[1]
(p[0]) add s1[0] = 64, s1[1]
(p[0]) add s2[0] = 64, s2[1]
(p[0]) add s3[0] = 64, s3[1]
(p[1]) mov src = s0[1] // for the epilog code
(p[4]) stf8 [dl0] = tl0[4], 64
(p[4]) stf8 [dh0] = th0[4], 64
(p[4]) stf8 [dl1] = tl1[4], 64
(p[4]) stf8 [dh1] = th1[4], 64
(p[4]) stf8 [dl2] = tl2[4], 64
(p[4]) stf8 [dh2] = th2[4], 64
(p[4]) stf8 [dl3] = tl3[4], 64
(p[4]) stf8 [dh3] = th3[4], 64
br.ctop.sptk.many .l0
.done:
mov dest = dl0 // dest = first byte not yet written
// Copy the remaining rescnt (0..7) words.  Bit 0 of rescnt selects one
// word (p10), bit 1 two words (p11) and bit 2 four words (p12).  The
// restores of ar.lc, ar.pfs and pr are interleaved with the copies.
.epilog:
cmp.eq p6, p0 = rescnt, r0 // are there any words left to copy?
tbit.nz p10, p0 = rescnt, 0
(p6) br.cond.spnt .restore_and_exit ;;
(p10) ld8 r[0] = [src], 8
tbit.nz p11, p0 = rescnt, 1 ;;
(p11) ld8 r[1] = [src], 8
(p10) st8 [dest] = r[0], 8 ;;
(p11) ld8 r[2] = [src], 8
(p11) st8 [dest] = r[1], 8
tbit.nz p12, p0 = rescnt, 2 ;;
(p12) ld8 r[3] = [src], 8
(p11) st8 [dest] = r[2], 8 ;;
(p12) ld8 r[4] = [src], 8
(p12) st8 [dest] = r[3], 8 ;;
(p12) ld8 r[5] = [src], 8
(p12) st8 [dest] = r[4], 8
mov ar.lc = saved_lc ;; // restore the loop counter
(p12) ld8 r[6] = [src], 8
(p12) st8 [dest] = r[5], 8
mov ar.pfs = saved_pfs;; // restore the PFS
(p12) st8 [dest] = r[6]
mov pr = saved_pr, -1 // restore the predicate registers
br.ret.sptk.many b0
// General path: at least one of dest, src, len is not a multiple of 8.
.next:
cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
and loopcnt = 7, tmp2 // loopcnt = -dest % 8
(p6) br.cond.spnt .cpyfew // copy byte by byte
;;
cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
(p6) br.cond.sptk .dest_aligned
sub len = len, loopcnt // len -= -dest % 8
adds loopcnt = -1, loopcnt // --loopcnt
;;
mov ar.lc = loopcnt
.l1: // copy -dest % 8 bytes
ld1 value = [src], 1 // value = *src++
;;
st1 [dest] = value, 1 // *dest++ = value
br.cloop.dptk .l1
// dest is now word aligned.  Split the remaining length into whole
// words (loopcnt) and trailing bytes (len).
.dest_aligned:
and sh1 = 7, src // sh1 = src % 8
and tmp2 = -8, len // tmp2 = len & -OPSIZ
and asrc = -8, src // asrc = src & -OPSIZ -- align src
shr.u loopcnt = len, 3 // loopcnt = len / 8
and len = 7, len;; // len = len % 8
adds loopcnt = -1, loopcnt // --loopcnt
addl tmp4 = @ltoff(.table), gp
addl tmp3 = @ltoff(.loop56), gp
mov ar.ec = MEMLAT + 1 // set EC
mov pr.rot = 1 << 16;; // set rotating predicates
mov ar.lc = loopcnt // set LC
cmp.eq p6, p0 = sh1, r0 // is the src aligned?
(p6) br.cond.sptk .src_aligned
// src is misaligned: pick the LOOP instance whose shift equals
// 8 * (src % 8).  .table holds, per misalignment, the distance from
// that loop back to .loop56, so the target is &.loop56 - table entry.
add src = src, tmp2 // src += len & -OPSIZ
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
ld8 ploop56 = [tmp3] // ploop56 = &loop56
ld8 ptable = [tmp4];; // ptable = &table
add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
ld8 tmp4 = [tmp3];; // tmp4 = loop offset
sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
ld8 r[1] = [asrc], 8;; // w0
mov b6 = loopaddr;;
br b6 // jump to the appropriate loop
LOOP(8)
LOOP(16)
LOOP(24)
LOOP(32)
LOOP(40)
LOOP(48)
LOOP(56)
// Both pointers are word aligned: plain pipelined word copy.
.src_aligned:
.l3:
(p[0]) ld8 r[0] = [src], 8
(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
br.ctop.dptk .l3
// Byte loop for short inputs and for the bytes left after a word loop.
.cpyfew:
cmp.eq p6, p0 = len, r0 // is len == 0 ?
adds len = -1, len // --len;
(p6) br.cond.spnt .restore_and_exit ;;
mov ar.lc = len
.l4:
ld1 value = [src], 1
;;
st1 [dest] = value, 1
br.cloop.dptk .l4 ;;
.restore_and_exit:
mov ar.pfs = saved_pfs // restore the PFS
mov pr = saved_pr, -1 // restore the predicate registers
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
// Offsets of the LOOP instances relative to .loop56, indexed by
// src % 8.  Entry 0 is never used: an aligned src goes to .src_aligned.
.align 8
.table:
data8 0 // dummy entry
data8 .loop56 - .loop8
data8 .loop56 - .loop16
data8 .loop56 - .loop24
data8 .loop56 - .loop32
data8 .loop56 - .loop40
data8 .loop56 - .loop48
data8 .loop56 - .loop56
END(memcpy)

238
sysdeps/ia64/memmove.S Normal file
View File

@ -0,0 +1,238 @@
/* Optimized version of the standard memmove() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: dest
Inputs:
in0: dest
in1: src
in2: byte count
The core of the function is the memcpy implementation used in memcpy.S.
When bytes have to be copied backwards, only the easy case, when
all arguments are multiples of 8, is optimised.
In this form, it assumes little endian mode. For big endian mode,
sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
or the UM.be bit should be cleared at the beginning and set at the end. */
#include <sysdep.h>
#undef ret
/* Tuning constants and register aliases used by memmove below.
   OP_T_THRES is the length at or below which the general forward path
   copies byte by byte; OPSIZ (the word size in bytes) is referenced
   only in comments.
   saved_pfs/saved_pr/saved_lc hold the caller state restored on exit;
   dest/src/len are working copies of the arguments; adest is a second
   store pointer used by the all-aligned forward loop; asrc is either
   the second load pointer of that loop or src rounded down to a
   multiple of 8; ptable/ploop56/loopaddr are used to compute the
   address of the LOOP instance to jump to; sh1 is the src misalignment;
   loopcnt feeds ar.lc; value and tmp2..tmp4 are scratch.  */
#define OP_T_THRES 16
#define OPSIZ 8
#define saved_pfs r14
#define adest r15
#define saved_pr r17
#define saved_lc r18
#define dest r19
#define src r20
#define len r21
#define asrc r22
#define tmp2 r23
#define tmp3 r24
#define tmp4 r25
#define ptable r26
#define ploop56 r27
#define loopaddr r28
#define sh1 r29
#define loopcnt r30
#define value r31
/* LOOP(shift) expands to one software-pipelined word-copy loop,
   labelled .loop<shift>, for a source misaligned by shift bits
   (shift = 8 * (src % 8)).  Each pass loads the next aligned source
   word, uses shrp to extract 64 bits spanning two neighbouring words,
   and stores the word built by the previous pass.  When the loop
   finishes, control goes to .cpyfew for the trailing bytes.
   The caller must have set up asrc, dest, ar.lc, ar.ec, the rotating
   predicates and r[1] (the first source word) beforehand.
   Comments inside the macro must use the C form, because a line
   comment would swallow the continuation backslash.  */
#define LOOP(shift) \
.align 32 ; \
.loop##shift##: \
(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
(p[MEMLAT+1]) st8 [dest] = value, 8 ; \
(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
nop.b 0 ; \
nop.b 0 ; \
br.ctop.sptk .loop##shift ; \
br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
/* memmove: copy len bytes from src to dest; the blocks may overlap.
   In:	in0 = dest, in1 = src, in2 = byte count.
   Out:	ret0 = dest.
   Direction: forward when dest <= src or dest >= src + len, backward
   otherwise.
   Forward: if dest, src and len are all multiples of 8, a loop moving
   two words per pass (.l0); otherwise the same scheme as memcpy
   (.next): align dest, word loop, trailing bytes.
   Backward: a word loop (.l5) if everything is a multiple of 8, else a
   byte loop (.l6).
   ar.pfs, pr and ar.lc are saved on entry and restored on exit.
   MEMLAT is not defined in this file; presumably it comes from
   softpipe.h (confirm there).  */
ENTRY(memmove)
alloc saved_pfs = ar.pfs, 3, 29, 0, 32
#include "softpipe.h"
.rotr r[MEMLAT + 2], q[MEMLAT + 1]
.rotp p[MEMLAT + 2]
mov ret0 = in0 // return value = dest
mov saved_pr = pr // save the predicate registers
mov saved_lc = ar.lc // save the loop counter
or tmp3 = in0, in1 ;; // tmp3 = dest | src
or tmp3 = tmp3, in2 // tmp3 = dest | src | len
mov dest = in0 // dest
mov src = in1 // src
mov len = in2 // len
sub tmp2 = r0, in0 // tmp2 = -dest
cmp.eq p6, p0 = in2, r0 // if (len == 0)
(p6) br.cond.spnt .restore_and_exit;;// return dest;
and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
// NOTE(review): cmp.le and cmp.lt below are signed comparisons applied
// to addresses -- confirm that this is intended.
cmp.le p6, p0 = dest, src // if dest <= src it's always safe
(p6) br.cond.spnt .forward // to copy forward
add tmp3 = src, len;;
cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
(p6) br.cond.spnt .backward // we have to copy backward
.forward:
shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
(p6) br.cond.sptk .next // goto next;
// The optimal case, when dest, src and len are all multiples of 8
// The loop moves 16 bytes per pass, so when len is an odd number of
// words one word is copied up front.
and tmp3 = 0xf, len
mov pr.rot = 1 << 16 // set rotating predicates
mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
adds loopcnt = -1, loopcnt;; // --loopcnt
(p6) ld8 value = [src], 8;;
(p6) st8 [dest] = value, 8 // copy the "odd" word
mov ar.lc = loopcnt // set the loop counter
cmp.eq p6, p0 = 8, len
(p6) br.cond.spnt .restore_and_exit;;// the one-word special case
adds adest = 8, dest // set adest one word ahead of dest
adds asrc = 8, src ;; // set asrc one word ahead of src
nop.b 0 // get the "golden" alignment for
nop.b 0 // the next loop
// Pipelined loop moving two words per pass through two pointer pairs
// (src/dest and asrc/adest), each stepping by 16 bytes.
.l0:
(p[0]) ld8 r[0] = [src], 16
(p[0]) ld8 q[0] = [asrc], 16
(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
br.ctop.dptk .l0 ;;
mov ar.pfs = saved_pfs // restore the PFS
mov pr = saved_pr, -1 // restore the predicate registers
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
// General forward path: at least one of dest, src, len is not a
// multiple of 8.
.next:
cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
and loopcnt = 7, tmp2 // loopcnt = -dest % 8
(p6) br.cond.spnt .cpyfew // copy byte by byte
;;
cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
(p6) br.cond.sptk .dest_aligned
sub len = len, loopcnt // len -= -dest % 8
adds loopcnt = -1, loopcnt // --loopcnt
;;
mov ar.lc = loopcnt
.l1: // copy -dest % 8 bytes
ld1 value = [src], 1 // value = *src++
;;
st1 [dest] = value, 1 // *dest++ = value
br.cloop.dptk .l1
// dest is now word aligned.  Split the remaining length into whole
// words (loopcnt) and trailing bytes (len).
.dest_aligned:
and sh1 = 7, src // sh1 = src % 8
and tmp2 = -8, len // tmp2 = len & -OPSIZ
and asrc = -8, src // asrc = src & -OPSIZ -- align src
shr.u loopcnt = len, 3 // loopcnt = len / 8
and len = 7, len;; // len = len % 8
adds loopcnt = -1, loopcnt // --loopcnt
addl tmp4 = @ltoff(.table), gp
addl tmp3 = @ltoff(.loop56), gp
mov ar.ec = MEMLAT + 1 // set EC
mov pr.rot = 1 << 16;; // set rotating predicates
mov ar.lc = loopcnt // set LC
cmp.eq p6, p0 = sh1, r0 // is the src aligned?
(p6) br.cond.sptk .src_aligned
// src is misaligned: pick the LOOP instance whose shift equals
// 8 * (src % 8).  .table holds, per misalignment, the distance from
// that loop back to .loop56, so the target is &.loop56 - table entry.
add src = src, tmp2 // src += len & -OPSIZ
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
ld8 ploop56 = [tmp3] // ploop56 = &loop56
ld8 ptable = [tmp4];; // ptable = &table
add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
ld8 tmp4 = [tmp3];; // tmp4 = loop offset
sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
ld8 r[1] = [asrc], 8;; // w0
mov b6 = loopaddr;;
br b6 // jump to the appropriate loop
LOOP(8)
LOOP(16)
LOOP(24)
LOOP(32)
LOOP(40)
LOOP(48)
LOOP(56)
// Both pointers are word aligned: plain pipelined word copy.
.src_aligned:
.l3:
(p[0]) ld8 r[0] = [src], 8
(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
br.ctop.dptk .l3
// Byte loop for short inputs and for the bytes left after a word loop.
.cpyfew:
cmp.eq p6, p0 = len, r0 // is len == 0 ?
adds len = -1, len // --len;
(p6) br.cond.spnt .restore_and_exit ;;
mov ar.lc = len
.l4:
ld1 value = [src], 1
;;
st1 [dest] = value, 1
br.cloop.dptk .l4 ;;
.restore_and_exit:
mov ar.pfs = saved_pfs // restore the PFS
mov pr = saved_pr, -1 // restore the predicate registers
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
// In the case of a backward copy, optimise only the case when everything
// is a multiple of 8, otherwise copy byte by byte. The backward copy is
// used only when the blocks are overlapping and dest > src.
.backward:
shr.u loopcnt = len, 3 // loopcnt = len / 8
add src = src, len // src points one byte past the end
add dest = dest, len ;; // dest points one byte past the end
mov ar.ec = MEMLAT + 1 // set the epilog counter
mov pr.rot = 1 << 16 // set rotating predicates
adds loopcnt = -1, loopcnt // --loopcnt
cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
(p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
adds src = -8, src // src points to the last word
adds dest = -8, dest // dest points to the last word
mov ar.lc = loopcnt;; // set the loop counter
// Pipelined word copy walking from the last word down to the first.
.l5:
(p[0]) ld8 r[0] = [src], -8
(p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
br.ctop.dptk .l5
br.cond.sptk .restore_and_exit
.bytecopy:
adds src = -1, src // src points to the last byte
adds dest = -1, dest // dest points to the last byte
adds loopcnt = -1, len;; // loopcnt = len - 1
mov ar.lc = loopcnt;; // set the loop counter
// Pipelined byte copy walking from the last byte down to the first.
.l6:
(p[0]) ld1 r[0] = [src], -1
(p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
br.ctop.dptk .l6
br.cond.sptk .restore_and_exit
// Offsets of the LOOP instances relative to .loop56, indexed by
// src % 8.  Entry 0 is never used: an aligned src goes to .src_aligned.
// NOTE(review): memcpy.S puts an explicit .align 8 before its table;
// here the alignment of the data8 entries relies on whatever precedes
// the label -- confirm that this is sufficient.
.table:
data8 0 // dummy entry
data8 .loop56 - .loop8
data8 .loop56 - .loop16
data8 .loop56 - .loop24
data8 .loop56 - .loop32
data8 .loop56 - .loop40
data8 .loop56 - .loop48
data8 .loop56 - .loop56
END(memmove)

95
sysdeps/ia64/memset.S Normal file
View File

@ -0,0 +1,95 @@
/* Optimized version of the standard memset() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: dest
Inputs:
in0: dest
in1: value
in2: count
The algorithm is fairly straightforward: set byte by byte until we
get to a word aligned address, then set word by word as much as
possible; the remaining few bytes are set one by one. */
#include <sysdep.h>
#undef ret
/* Register aliases.  The three arguments arrive in in0-in2; the rest are
   the seven local registers reserved by the alloc below:
     save_pfs  saved ar.pfs
     ptr1      running store pointer
     ptr2      second store pointer (ptr1 + 8), used only in .l2
     tmp       dest % 8
     loopcnt   trip count, loaded into ar.lc (as count - 1) for br.cloop
     save_lc   saved ar.lc
     wordval   byteval replicated into all eight bytes  */
#define dest in0
#define byteval in1
#define cnt in2
#define save_pfs loc0
#define ptr1 loc1
#define ptr2 loc2
#define tmp loc3
#define loopcnt loc4
#define save_lc loc5
#define wordval loc6
/* memset: store the low byte of byteval into cnt bytes starting at dest
   and return dest in ret0.  */
ENTRY(memset)
alloc save_pfs = ar.pfs, 3, 7, 0, 0
mov save_lc = ar.lc
// return value = dest
mov ret0 = dest
// tmp = dest % 8
and tmp = 7, dest
// nothing to do when cnt == 0
cmp.eq p6, p0 = cnt, r0
(p6) br.cond.spnt .restore_and_exit ;;
mov ptr1 = dest
// loopcnt = bytes needed to reach the next 8-byte boundary
sub loopcnt = 8, tmp
// fewer than 16 bytes (signed compare): just store them one by one
cmp.gt p6, p0 = 16, cnt
(p6) br.cond.spnt .set_few;;
// skip the alignment loop when dest is already 8-byte aligned
cmp.eq p6, p0 = tmp, r0
(p6) br.cond.sptk .dest_aligned
// account for the alignment bytes about to be written
sub cnt = cnt, loopcnt
// br.cloop runs ar.lc + 1 times, hence the -1
adds loopcnt = -1, loopcnt;;
mov ar.lc = loopcnt;;
// byte stores up to the 8-byte boundary
.l1:
st1 [ptr1] = byteval, 1
br.cloop.dptk .l1 ;;
.dest_aligned:
// ptr2 covers the second word of each 16-byte chunk
adds ptr2 = 8, ptr1
// broadcast the low byte of byteval into every byte of wordval
mux1 wordval = byteval, @brcst
shr.u loopcnt = cnt, 4 ;; // loopcnt = cnt / 16
// No full 16-byte chunk: go to .one_more with p6 still set from this
// compare, so the single st8 there is executed.  On this path cnt is
// between 9 and 15: it was >= 16 before at most 7 alignment bytes were
// subtracted, so one 8-byte store always fits.
cmp.eq p6, p0 = loopcnt, r0
(p6) br.cond.spnt .one_more
and cnt = 0xf, cnt // compute the remaining cnt
adds loopcnt = -1, loopcnt;;
mov ar.lc = loopcnt;;
// main loop: 16 bytes per iteration through two interleaved pointers
.l2:
st8 [ptr1] = wordval, 16
st8 [ptr2] = wordval, 16
br.cloop.dptk .l2
// p6 = (8 <= cnt): is there room for one more whole word?
cmp.le p6, p0 = 8, cnt ;;
.one_more:
(p6) st8 [ptr1] = wordval, 8
(p6) adds cnt = -8, cnt ;;
// done if no tail bytes are left
cmp.eq p6, p0 = cnt, r0
(p6) br.cond.spnt .restore_and_exit
// byte-by-byte store of the remaining cnt bytes (cnt != 0 here)
.set_few:
adds loopcnt = -1, cnt;;
mov ar.lc = loopcnt;;
.l3:
st1 [ptr1] = byteval, 1
br.cloop.dptk .l3 ;;
.restore_and_exit:
mov ar.lc = save_lc
mov ar.pfs = save_pfs
br.ret.sptk.many b0
END(memset)

29
sysdeps/ia64/softpipe.h Normal file
View File

@ -0,0 +1,29 @@
/* This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* The latency of a memory load assumed by the assembly implementation
of the mem and str functions. Since we don't have any clue about
where the data might be, let's assume it's in the L2 cache.
Assuming L3 would be too pessimistic :-)
Some functions define MEMLAT as 2, because they expect their data
to be in the L1D cache. */
#ifndef MEMLAT
/* Default value, used only when the including file has not already
   defined MEMLAT itself.  */
# define MEMLAT 6
#endif

60
sysdeps/ia64/strcat.S Normal file
View File

@ -0,0 +1,60 @@
/* IA-64 assembly version of the standard strcat() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: dest
Inputs:
in0: dest
in1: src
A straightforward implementation: strcpy(dest + strlen(dest), src).
Could be marginally optimised by inlining the code of strcpy() and
strlen(), to avoid the two function calls. OTOH, by not doing that,
we avoid L1I cache pollution (code using strcat() is likely to also
use strcpy() and strlen(), so we already have a copy of these functions
in the cache). */
#include <sysdep.h>
#undef ret
/* Register aliases: src/dest are the two arguments; save_pfs, save_b0 and
   tmp are the three local registers reserved by the alloc below (tmp holds
   the caller's gp across the calls); rc is the return-value register.  */
#define src in1
#define dest in0
#define save_pfs loc0
#define save_b0 loc1
#define tmp loc2
#define rc ret0
/* strcat: append src to dest by calling strlen() and then strcpy();
   returns dest in ret0.  */
ENTRY(strcat)
// 2 inputs, 3 locals, 2 outputs (out0/out1 carry the callee arguments)
alloc save_pfs = ar.pfs, 2, 3, 2, 0
// b0 is overwritten by the br.call instructions below, so keep a copy
mov save_b0 = b0
mov out0 = dest
// keep gp so it can be put back after each call returns
mov tmp = gp ;;
br.call.sptk.many b0 = strlen# ;; // rc = strlen(dest);
mov gp = tmp
// first argument for strcpy: the end of the existing string
add out0 = dest, rc
mov out1 = src
br.call.sptk.many b0 = strcpy# ;; // strcpy(dest + strlen(dest), src)
mov gp = tmp
// the return value is the original dest, not what strcpy left in ret0
mov rc = dest
mov b0 = save_b0
mov ar.pfs = save_pfs
br.ret.sptk.many b0
END(strcat)

111
sysdeps/ia64/strchr.S Normal file
View File

@ -0,0 +1,111 @@
/* Optimized version of the standard strchr() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: the address of the first occurrence of chr in str or NULL
Inputs:
in0: str
in1: chr
A modified version of memchr.S, the search ends when the character is
found or the terminating null character is encountered.
This implementation assumes little endian mode. For big endian mode,
the instruction czx1.r should be replaced by czx1.l. */
#include <sysdep.h>
#undef ret
/* Register aliases:
     saved_pfs, saved_lc  copies of ar.pfs and ar.lc
     poschr   index of the first byte of the current word equal to chr,
              or 8 if there is none
     pos0     index of the first zero byte of the current word, or 8
     val1     word currently being examined
     val2     byte just loaded (.l1) / word read ahead (.l2)
     tmp      str % 8, later val1 ^ chrx8
     chrx8    chr replicated into all eight bytes
     loopcnt  trip count for the byte loop
   ret0 doubles as the scanning pointer throughout.  */
#define saved_pfs r14
#define saved_lc r18
#define poschr r19
#define pos0 r20
#define val1 r21
#define val2 r22
#define tmp r24
#define chrx8 r25
#define loopcnt r30
#define str in0
#define chr in1
/* strchr: scan byte by byte up to an 8-byte boundary, then one word at a
   time with a speculative read-ahead of the following word.  */
ENTRY(strchr)
alloc saved_pfs = ar.pfs, 2, 0, 0, 0
mov saved_lc = ar.lc // save the loop counter
// ret0 is the running pointer; every load below post-increments it
mov ret0 = str
and tmp = 7, str // tmp = str % 8
// mux1 reads chr in the same instruction group in which extr.u rewrites it
mux1 chrx8 = chr, @brcst
extr.u chr = chr, 0, 8 // retain only the last byte
cmp.ne p8, p0 = r0, r0 // clear p8
;;
sub loopcnt = 8, tmp // loopcnt = 8 - tmp
// already aligned: go straight to the word loop
cmp.eq p6, p0 = tmp, r0
(p6) br.cond.sptk .str_aligned;;
// br.cloop runs ar.lc + 1 times, hence the -1
adds loopcnt = -1, loopcnt;;
mov ar.lc = loopcnt
// byte loop up to the next 8-byte boundary
.l1:
ld1 val2 = [ret0], 1
;;
cmp.eq p6, p0 = val2, chr
cmp.eq p7, p0 = val2, r0
// The match is tested before the terminator, so searching for '\0'
// returns the address of the terminator rather than NULL.
(p6) br.cond.spnt .restore_and_exit
(p7) br.cond.spnt .notfound
br.cloop.sptk .l1
.str_aligned:
ld8 val1 = [ret0], 8;;
// NOTE(review): the two nops look like padding so that .l2 starts on a
// bundle boundary -- confirm
nop.b 0
nop.b 0
// word loop: val1 is examined while val2 (the next word) is read ahead
.l2:
ld8.s val2 = [ret0], 8 // don't bomb out here
czx1.r pos0 = val1
xor tmp = val1, chrx8 // if val1 contains chr, tmp will
;; // contain a zero in its position
czx1.r poschr = tmp
// p6 = val1 contains a null byte
cmp.ne p6, p0 = 8, pos0
;;
// p7 = val1 contains chr
cmp.ne p7, p0 = 8, poschr
(p7) br.cond.spnt .foundit
(p6) br.cond.spnt .notfound
// val2 is only validated once val1 is known to hold neither chr nor null
chk.s val2, .recovery
.back:
mov val1 = val2
br.cond.dptk .l2
.foundit:
(p6) cmp.lt p8, p0 = pos0, poschr // we found chr and null in the word
(p8) br.cond.spnt .notfound // null was found before chr
// ret0 is 16 past the start of val1 here (two post-incremented ld8s)
add ret0 = ret0, poschr ;;
adds ret0 = -15, ret0 ;; // should be -16, but we decrement
.restore_and_exit: // ret0 in the next instruction
adds ret0 = -1, ret0 // ret0 was pointing 1 char too far
mov ar.pfs = saved_pfs // restore the PFS
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
.notfound:
mov ret0 = r0 // return NULL if null was found
mov ar.pfs = saved_pfs // first
mov ar.lc = saved_lc
br.ret.sptk.many b0
// Reached when the speculative load of val2 failed: step ret0 back to the
// word's address and redo the load non-speculatively.
.recovery:
adds ret0 = -8, ret0;;
ld8 val2 = [ret0], 8 // bomb out here
br.cond.sptk .back
END(strchr)
weak_alias(strchr, index)

55
sysdeps/ia64/strcmp.S Normal file
View File

@ -0,0 +1,55 @@
/* Optimized version of the standard strcmp() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: the result of the comparison
Inputs:
in0: s1
in1: s2
Unlike memcmp(), this function is optimized for mismatches within the
first few characters. */
#include <sysdep.h>
#undef ret
/* Register aliases: s1/s2 are the two string arguments, val1/val2 the
   bytes most recently loaded from them, saved_pfs the copy of ar.pfs.  */
#define s1 in0
#define s2 in1
#define saved_pfs r14
#define val1 r15
#define val2 r16
/* strcmp: compare one byte pair per iteration; returns val1 - val2 for the
   first pair that differs or contains a null byte.  */
ENTRY(strcmp)
alloc saved_pfs = ar.pfs, 2, 0, 0, 0
.loop:
// load the next byte of each string and advance both pointers
ld1 val1 = [s1], 1
ld1 val2 = [s2], 1
cmp.eq p6, p0 = r0, r0 // set p6
;;
// p6 stays set only if both bytes are non-zero and equal to each other;
// each .and compare can only clear it
cmp.ne.and p6, p0 = val1, r0
cmp.ne.and p6, p0 = val2, r0
cmp.eq.and p6, p0 = val1, val2
(p6) br.cond.sptk .loop
// difference of the pair that ended the loop (0 if both are the null byte)
sub ret0 = val1, val2
mov ar.pfs = saved_pfs
br.ret.sptk.many b0
END(strcmp)

142
sysdeps/ia64/strcpy.S Normal file
View File

@ -0,0 +1,142 @@
/* Optimized version of the standard strcpy() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: dest
Inputs:
in0: dest
in1: src
In this form, it assumes little endian mode. For big endian mode, the
two shifts in .l2 must be inverted:
shl value = r[1], sh1 // value = w0 << sh1
shr.u tmp = r[0], sh2 // tmp = w1 >> sh2
*/
#include <sysdep.h>
#undef ret
/* Register aliases:
     saved_pfs, saved_lc, saved_pr  copies of ar.pfs, ar.lc and pr
     thresh   8 - src % 8: number of bytes of `value' that really come
              from the string after the first shift in .l2
     dest     running destination pointer (copy of in0)
     src      running source pointer (copy of in1)
     asrc     src rounded down to a multiple of 8
     tmp      scratch
     pos      index of the first zero byte in `value', or 8 if none
     c        byte being copied
     sh1      8 * (src % 8), right-shift count
     sh2      64 - sh1, left-shift count
     loopcnt  trip count for the alignment loop
     value    the eight bytes about to be stored
   len, w0 and w1 are not referenced by the code below (w0/w1 appear only
   in comments).  */
#define saved_pfs r14
#define saved_lc r15
#define saved_pr r16
#define thresh r17
#define dest r19
#define src r20
#define len r21
#define asrc r22
#define tmp r23
#define pos r24
#define w0 r25
#define w1 r26
#define c r27
#define sh2 r28
#define sh1 r29
#define loopcnt r30
#define value r31
/* strcpy: copy bytes until dest is 8-byte aligned, then copy whole words,
   merging two source words with shifts when src is not aligned as well.
   Source words are read ahead with speculative loads (ld8.s) and checked
   with chk.s before use.  The word that contains the terminator is
   written out byte by byte in .found0.  Returns dest in ret0.  */
ENTRY(strcpy)
// The last operand makes all 32 stacked registers rotating, in0/in1
// included, so the arguments are copied into dest/src below.
alloc saved_pfs = ar.pfs, 2, 0, 30, 32
#define MEMLAT 2
.rotr r[MEMLAT + 2]
.rotp p[MEMLAT + 1]
mov ret0 = in0 // return value = dest
mov saved_pr = pr // save the predicate registers
mov saved_lc = ar.lc // save the loop counter
sub tmp = r0, in0 ;; // tmp = -dest
mov dest = in0 // dest
mov src = in1 // src
and loopcnt = 7, tmp ;; // loopcnt = -dest % 8
// the compare reads loopcnt before the adds in the same group rewrites it
cmp.eq p6, p0 = loopcnt, r0
adds loopcnt = -1, loopcnt // --loopcnt
(p6) br.cond.sptk .dest_aligned ;;
mov ar.lc = loopcnt
.l1: // copy -dest % 8 bytes
ld1 c = [src], 1 // c = *src++
;;
st1 [dest] = c, 1 // *dest++ = c
// the terminator has just been stored: finished
cmp.eq p6, p0 = c, r0
(p6) br.cond.dpnt .restore_and_exit
br.cloop.dptk .l1 ;;
.dest_aligned:
and sh1 = 7, src // sh1 = src % 8
mov ar.lc = -1 // "infinite" loop
and asrc = -8, src ;; // asrc = src & -OPSIZ -- align src
// thresh is computed from src % 8, before sh1 is scaled to a bit count
sub thresh = 8, sh1
mov pr.rot = 1 << 16 // set rotating predicates
cmp.ne p7, p0 = r0, r0 // clear p7
shl sh1 = sh1, 3 ;; // sh1 = 8 * (src % 8)
sub sh2 = 64, sh1 // sh2 = 64 - sh1
cmp.eq p6, p0 = sh1, r0 // is the src aligned?
(p6) br.cond.sptk .src_aligned ;;
// prime the pipeline with the aligned word that contains the first byte
ld8 r[1] = [asrc],8 ;;
.align 32
// Unaligned source: each destination word is assembled from the upper
// bytes of r[1] and the lower bytes of the read-ahead word r[0].
.l2:
ld8.s r[0] = [asrc], 8
shr.u value = r[1], sh1 ;; // value = w0 >> sh1
czx1.r pos = value ;; // do we have an "early" zero
cmp.lt p7, p0 = pos, thresh // in w0 >> sh1?
(p7) br.cond.dpnt .found0
chk.s r[0], .recovery2 // it is safe to do that only
.back2: // after the previous test
shl tmp = r[0], sh2 // tmp = w1 << sh2
;;
or value = value, tmp ;; // value |= tmp
czx1.r pos = value ;;
cmp.ne p7, p0 = 8, pos
(p7) br.cond.dpnt .found0
st8 [dest] = value, 8 // store val to dest
// rotation turns this iteration's r[0] into the next iteration's r[1]
br.ctop.dptk .l2 ;;
// Aligned source: software-pipelined copy; the word loaded in one
// iteration is checked and stored MEMLAT iterations later.
.src_aligned:
.l3:
(p[0]) ld8.s r[0] = [src], 8
(p[MEMLAT]) chk.s r[MEMLAT], .recovery3
.back3:
(p[MEMLAT]) mov value = r[MEMLAT]
(p[MEMLAT]) czx1.r pos = r[MEMLAT] ;;
(p[MEMLAT]) cmp.ne p7, p0 = 8, pos
(p7) br.cond.dpnt .found0
(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
br.ctop.dptk .l3 ;;
// `value' holds the final bytes and pos is the index of the terminator:
// store pos + 1 bytes, lowest byte first.
.found0:
mov ar.lc = pos
.l4:
extr.u c = value, 0, 8 // c = value & 0xff
shr.u value = value, 8
;;
st1 [dest] = c, 1
br.cloop.dptk .l4 ;;
.restore_and_exit:
mov ar.pfs = saved_pfs // restore the PFS
mov ar.lc = saved_lc // restore the loop counter
mov pr = saved_pr, -1 // restore the predicate registers
br.ret.sptk.many b0
// Recovery for the speculative load in .l2.  The failed chk.s tested r[0]
// and .back2 goes on to read r[0], so the non-speculative reload must
// target r[0] as well.  (It used to target r[MEMLAT], which left the
// deferred fault in r[0].)  asrc was already post-incremented by the
// ld8.s, hence the -8.
.recovery2:
add tmp = -8, asrc ;;
ld8 r[0] = [tmp]
br.cond.sptk .back2
// Recovery for the pipelined load in .l3.  The word now in r[MEMLAT] was
// loaded MEMLAT iterations ago and src has been post-incremented
// MEMLAT + 1 times since its address was used.
.recovery3:
add tmp = -(MEMLAT + 1) * 8, src ;;
ld8 r[MEMLAT] = [tmp]
br.cond.sptk .back3
END(strcpy)

96
sysdeps/ia64/strlen.S Normal file
View File

@ -0,0 +1,96 @@
/* Optimized version of the standard strlen() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: the length of the input string
Input:
in0: str
Look for the null character byte by byte, until we reach a word aligned
address, then search word by word, using the czx instruction. We're
also doing one word of read ahead, which could cause problems if the
null character is on the last word of a page and the next page is not
mapped in the process address space. Hence the use of the speculative
load.
This implementation assumes little endian mode. For big endian mode,
the instruction czx1.r should be replaced by czx1.l. */
#include <sysdep.h>
#undef ret
/* Register aliases:
     saved_pfs, saved_lc  copies of ar.pfs and ar.lc
     str      running pointer into the string
     pos0     index of the first zero byte of val1, or 8 if there is none
     val1     word currently being examined
     val2     byte just loaded (.l1) / word read ahead (l2)
     origadd  value of str when the word loop starts
     tmp      str % 8, later str - origadd
     loopcnt  trip count for the byte loop
     len      the result, accumulated directly in ret0  */
#define saved_pfs r14
#define saved_lc r18
#define str r19
#define pos0 r20
#define val1 r21
#define val2 r22
#define origadd r23
#define tmp r24
#define loopcnt r30
#define len ret0
ENTRY(strlen)
alloc saved_pfs = ar.pfs, 1, 0, 0, 0
mov saved_lc = ar.lc // save the loop counter
mov str = in0
mov len = r0 // len = 0
and tmp = 7, in0 // tmp = str % 8
;;
sub loopcnt = 8, tmp // loopcnt = 8 - tmp
// already aligned: go straight to the word loop
cmp.eq p6, p0 = tmp, r0
(p6) br.cond.sptk .str_aligned;;
// br.cloop runs ar.lc + 1 times, hence the -1
adds loopcnt = -1, loopcnt;;
mov ar.lc = loopcnt
// byte loop up to the next 8-byte boundary; len counts non-null bytes
.l1:
ld1 val2 = [str], 1
;;
cmp.eq p6, p0 = val2, r0
(p6) br.cond.spnt .restore_and_exit
adds len = 1, len
br.cloop.dptk .l1
.str_aligned:
mov origadd = str // origadd = orig
ld8 val1 = [str], 8;;
// NOTE(review): the two nops look like padding so that l2 starts on a
// bundle boundary -- confirm
nop.b 0
nop.b 0
// word loop: val1 is examined while val2 (the next word) is read ahead.
// NOTE(review): this label has no leading '.', unlike the other labels
// in this function.
l2: ld8.s val2 = [str], 8 // don't bomb out here
czx1.r pos0 = val1
;;
// p6 = val1 contains a null byte
cmp.ne p6, p0 = 8, pos0
(p6) br.cond.spnt .foundit
// val2 is only validated once val1 is known to hold no null byte
chk.s val2, .recovery
.back:
mov val1 = val2
br.cond.dptk l2
// str is 16 past the start of val1 here (two post-incremented ld8s), so
// len = bytes counted in .l1 + (str - origadd) + pos0 - 16
.foundit:
sub tmp = str, origadd // tmp = crt address - orig
add len = len, pos0;;
add len = len, tmp;;
adds len = -16, len
.restore_and_exit:
mov ar.pfs = saved_pfs // restore the PFS
mov ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
// Reached when the speculative load of val2 failed: step str back to the
// word's address and redo the load non-speculatively.
.recovery:
adds str = -8, str;;
ld8 val2 = [str], 8 // bomb out here
br.cond.sptk .back
END(strlen)

63
sysdeps/ia64/strncmp.S Normal file
View File

@ -0,0 +1,63 @@
/* Optimized version of the standard strncmp() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: the result of the comparison
Inputs:
in0: s1
in1: s2
in2: n
Unlike memcmp(), this function is optimized for mismatches within the
first few characters. */
#include <sysdep.h>
#undef ret
/* Register aliases: s1/s2 are the string arguments, n the remaining byte
   count, val1/val2 the bytes most recently loaded, saved_pfs the copy of
   ar.pfs.  */
#define s1 in0
#define s2 in1
#define n in2
#define saved_pfs r14
#define val1 r15
#define val2 r16
/* strncmp: compare one byte pair per iteration, for at most n pairs;
   returns val1 - val2 for the pair that ended the loop, or 0 when n == 0. */
ENTRY(strncmp)
alloc saved_pfs = ar.pfs, 3, 0, 0, 0
// result for the n == 0 early exit
mov ret0 = r0
// p6 is set once here; the loop is re-entered only while p6 is still
// set, so it does not need to be set again inside the loop
cmp.eq p6, p0 = r0, r0 // set p6
cmp.eq p7, p0 = n, r0 // return immediately if n == 0
(p7) br.cond.spnt .restore_and_exit ;;
.loop:
ld1 val1 = [s1], 1
ld1 val2 = [s2], 1
adds n = -1, n // n--
;;
// keep looping only while both bytes are non-zero, bytes remain to be
// compared, and the two bytes are equal; each .and compare can only
// clear p6
cmp.ne.and p6, p0 = val1, r0
cmp.ne.and p6, p0 = val2, r0
cmp.ne.and p6, p0 = n, r0
cmp.eq.and p6, p0 = val1, val2
(p6) br.cond.sptk .loop
// difference of the last pair examined
sub ret0 = val1, val2
.restore_and_exit:
mov ar.pfs = saved_pfs
br.ret.sptk.many b0
END(strncmp)

95
sysdeps/ia64/strncpy.S Normal file
View File

@ -0,0 +1,95 @@
/* Optimized version of the standard strncpy() function.
This file is part of the GNU C Library.
Copyright (C) 2000 Free Software Foundation, Inc.
Contributed by Dan Pop <Dan.Pop@cern.ch>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Return: dest
Inputs:
in0: dest
in1: src
in2: char count
If n >= 24, do a memcpy(dest, src, min(strlen(src)+1, n)), followed by a
memset(dest + strlen(src) + 1, 0, n - strlen(src) - 1) if necessary.
Otherwise, copy characters one by one and fill with nulls if necessary. */
#include <sysdep.h>
#undef ret
/* Register aliases.  saved_* and tmp/len are the six local registers
   reserved by the alloc below:
     saved_pfs, saved_b0, saved_pr, saved_lc  copies of ar.pfs, b0, pr, ar.lc
     tmp   caller's gp across the calls; the byte being copied in .cpyfew
     len   strlen(src) + 1
   dest/src/n are the three arguments; rc is the return-value register.  */
#define saved_pfs loc0
#define saved_b0 loc1
#define saved_pr loc2
#define saved_lc loc3
#define tmp loc4
#define len loc5
#define dest in0
#define src in1
#define n in2
#define rc ret0
ENTRY(strncpy)
// 3 inputs, 6 locals, 3 outputs (out0-out2 carry the callee arguments)
alloc saved_pfs = ar.pfs, 3, 6, 3, 0
mov saved_b0 = b0
mov saved_pr = pr
mov saved_lc = ar.lc
// n < 24 (unsigned): use the simple byte loop instead of the calls
cmp.gtu p6, p0 = 24, n
(p6) br.cond.spnt .cpyfew
mov out0 = src
// keep gp so it can be put back after each call returns
mov tmp = gp ;;
br.call.sptk.many b0 = strlen# ;; // rc = strlen(src);
add len = 1, rc // include the null in len
mov gp = tmp
mov out0 = dest ;;
// p4 = (len < n): the string fits and padding is needed; p5 = !p4.
// p4 is tested again after memcpy returns, so the code relies on it
// still holding the same value across that call.
cmp.ltu p4, p5 = len, n
mov out1 = src ;;
// out2 = min(len, n)
(p4) mov out2 = len
(p5) mov out2 = n
br.call.sptk.many b0 = memcpy# ;; // memcpy(dest, src, min(len, n));
mov gp = tmp
// memset(dest + len, 0, n - len), only when len < n
(p4) add out0 = dest, len
(p4) mov out1 = r0
(p4) sub out2 = n, len
(p4) br.call.sptk.many b0 = memset# ;; // fill the rest with nulls
(p4) mov gp = tmp
mov rc = dest
// this path restores b0, ar.pfs and pr; it never writes ar.lc itself
mov b0 = saved_b0
mov ar.pfs = saved_pfs
mov pr = saved_pr, -1
br.ret.sptk.many b0
// Short copy: exactly n byte stores.  p6 gates the load; once the null
// byte has been loaded, p6 is cleared, tmp stays 0, and the remaining
// iterations store zeros.
.cpyfew:
mov rc = dest
// the compare reads n before the adds in the same group rewrites it
cmp.eq p6, p0 = n, r0
adds n = -1, n
(p6) br.cond.spnt .restore_and_exit ;; // do nothing if n == 0
// ar.lc = n - 1, so the loop body runs n times
mov ar.lc = n
cmp.eq p6, p0 = r0, r0 ;; // set p6
.loop:
(p6) ld1 tmp = [src],1
;;
st1 [dest] = tmp, 1
(p6) cmp.ne p6, p0 = tmp, r0 // clear p6 after encountering the
br.cloop.dptk .loop ;; // null character in src
// this path restores only ar.lc and ar.pfs
.restore_and_exit:
mov ar.lc = saved_lc
mov ar.pfs = saved_pfs
br.ret.sptk.many b0
END(strncpy)