Fix spelling errors in sysdeps/powerpc files.
commit 2ccdea26f2
parent 375607b9cc

 ChangeLog | 46
@@ -1,3 +1,49 @@
+2013-01-07  Anton Blanchard  <anton@samba.org>
+
+	* sysdeps/powerpc/fpu/feholdexcpt.c: Fixed spelling errors.
+	* sysdeps/powerpc/fpu/feupdateenv.c: Likewise.
+	* sysdeps/powerpc/fpu/math_ldbl.h: Likewise.
+	* sysdeps/powerpc/powerpc32/bits/atomic.h: Likewise.
+	* sysdeps/powerpc/powerpc32/cell/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/dl-machine.c: Likewise.
+	* sysdeps/powerpc/powerpc32/dl-start.S: Likewise.
+	* sysdeps/powerpc/powerpc32/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/mpa.c: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/slowpow.c: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/hp-timing.h: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/memcmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memchr.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memcmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memrchr.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/strcasecmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/bits/atomic.h: Likewise.
+	* sysdeps/powerpc/powerpc64/cell/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/dl-machine.h: Likewise.
+	* sysdeps/powerpc/powerpc64/fpu/s_ceill.S: Likewise.
+	* sysdeps/powerpc/powerpc64/fpu/s_nearbyintl.S: Likewise.
+	* sysdeps/powerpc/powerpc64/hp-timing.h: Likewise.
+	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/fpu/mpa.c: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/fpu/slowpow.c: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/memcmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memchr.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memcmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memrchr.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/strcasecmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/strncmp.S: Likewise.
+
 2013-01-07  Joseph Myers  <joseph@codesourcery.com>
 
 	* malloc/malloc.h (__MALLOC_P): Remove all definitions.
@@ -33,7 +33,7 @@ feholdexcept (fenv_t *envp)
 new.l[1] = old.l[1] & 7;
 new.l[0] = old.l[0];

-/* If the old env had any eabled exceptions, then mask SIGFPE in the
+/* If the old env had any enabled exceptions, then mask SIGFPE in the
 MSR FE0/FE1 bits. This may allow the FPU to run faster because it
 always takes the default action and can not generate SIGFPE. */
 if ((old.l[1] & _FPU_MASK_ALL) != 0)
@@ -37,14 +37,14 @@ __feupdateenv (const fenv_t *envp)
 unchanged. */
 new.l[1] = (old.l[1] & 0x1FFFFF00) | (new.l[1] & 0x1FF80FFF);

-/* If the old env has no eabled exceptions and the new env has any enabled
+/* If the old env has no enabled exceptions and the new env has any enabled
 exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits. This will put
 the hardware into "precise mode" and may cause the FPU to run slower on
 some hardware. */
 if ((old.l[1] & _FPU_MASK_ALL) == 0 && (new.l[1] & _FPU_MASK_ALL) != 0)
 (void)__fe_nomask_env ();

-/* If the old env had any eabled exceptions and the new env has no enabled
+/* If the old env had any enabled exceptions and the new env has no enabled
 exceptions, then mask SIGFPE in the MSR FE0/FE1 bits. This may allow the
 FPU to run faster because it always takes the default action and can not
 generate SIGFPE. */
@@ -27,7 +27,7 @@ ldbl_extract_mantissa (int64_t *hi64, u_int64_t *lo64, int *exp, long double x)
 lo |= (1ULL << 52);
 lo = lo << 7; /* pre-shift lo to match ieee854. */
 /* The lower double is normalized separately from the upper. We
-may need to adjust the lower manitissa to reflect this. */
+may need to adjust the lower mantissa to reflect this. */
 ediff = eldbl.ieee.exponent - eldbl.ieee.exponent2;
 if (ediff > 53)
 lo = lo >> (ediff-53);
@@ -21,7 +21,7 @@
 This is a hint to the hardware to expect additional updates adjacent
 to the lock word or not. If we are acquiring a Mutex, the hint
 should be true. Otherwise we releasing a Mutex or doing a simple
-atomic operation. In that case we don't expect addtional updates
+atomic operation. In that case we don't expect additional updates
 adjacent to the lock word after the Store Conditional and the hint
 should be false. */
@@ -35,7 +35,7 @@

 /*
 * The 32-bit exchange_bool is different on powerpc64 because the subf
-* does signed 64-bit arthmatic while the lwarx is 32-bit unsigned
+* does signed 64-bit arithmetic while the lwarx is 32-bit unsigned
 * (a load word and zero (high 32) form). So powerpc64 has a slightly
 * different version in sysdeps/powerpc/powerpc64/bits/atomic.h.
 */
@@ -34,7 +34,7 @@
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
-* For best performance instructionforms ending in "." like "andi."
+* For best performance instruction forms ending in "." like "andi."
 * should be avoided as the are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 */
@@ -146,7 +146,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 lfd fp9, 0x08(r4)
 dcbz r11,r6
 lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */
-lfd fp11, 0x18(r4) /* to hide 1st level cache lantency. */
+lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */
 lfd fp12, 0x20(r4)
 stfd fp9, 0x08(r6)
 stfd fp10, 0x10(r6)
@@ -113,7 +113,7 @@ __elf_preferred_address (struct link_map *loader, size_t maplength,
 /* Otherwise, quickly look for a suitable gap between 0x3FFFF and
 0x70000000. 0x3FFFF is so that references off NULL pointers will
 cause a segfault, 0x70000000 is just paranoia (it should always
-be superceded by the program's load address). */
+be superseded by the program's load address). */
 low = 0x0003FFFF;
 high = 0x70000000;
 for (nsid = 0; nsid < DL_NNS; ++nsid)
@@ -74,7 +74,7 @@ _dl_start_user:
 slwi r5,r3,2
 add r6,r4,r5
 addi r5,r6,4
-/* pass the auxilary vector in r6. This is passed to us just after _envp. */
+/* pass the auxiliary vector in r6. This is passed to us just after _envp. */
 2: lwzu r0,4(r6)
 cmpwi r0,0
 bne 2b
@@ -275,7 +275,7 @@ L(checklinesize):
 beq cr1,L(nondcbz)

 /* If the cache line size is 32 bytes then goto to L(zloopstart),
-which is coded specificly for 32-byte lines (and 601). */
+which is coded specifically for 32-byte lines (and 601). */
 cmplwi cr1,rCLS,32
 beq cr1,L(zloopstart)
@@ -409,9 +409,9 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) {
 if (k > p2) {i1=k-p2; i2=p2+1; }
 else {i1=1; i2=k; }
 #if 1
-/* rearange this inner loop to allow the fmadd instructions to be
+/* rearrange this inner loop to allow the fmadd instructions to be
 independent and execute in parallel on processors that have
-dual symetrical FP pipelines. */
+dual symmetrical FP pipelines. */
 if (i1 < (i2-1))
 {
 /* make sure we have at least 2 iterations */
@@ -437,7 +437,7 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) {
 zk += x->d[i1]*y->d[i1];
 }
 #else
-/* The orginal code. */
+/* The original code. */
 for (i=i1,j=i2-1; i<i2; i++,j--) zk += X[i]*Y[j];
 #endif
@@ -59,7 +59,7 @@ __slowpow (double x, double y, double z)
 res1 = (double) (ldpp - ldeps);

 if (res != res1) /* if result still not accurate enough */
-{ /* use mpa for higher persision. */
+{ /* use mpa for higher precision. */
 mp_no mpx, mpy, mpz, mpw, mpp, mpr, mpr1;
 static const mp_no eps = { -3, {1.0, 4.0} };
 int p;
@@ -22,7 +22,7 @@
 /* double [fp1] sqrt (double x [fp1])
 Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
 The fsqrt instruction generates the correct value for all inputs and
-sets the appropriate floating point exceptions. Extented checking is
+sets the appropriate floating point exceptions. Extended checking is
 only needed to set errno (via __kernel_standard) if the input value
 is negative.
@@ -22,7 +22,7 @@
 /* float [fp1] sqrts (float x [fp1])
 Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
 The fsqrts instruction generates the correct value for all inputs and
-sets the appropriate floating point exceptions. Extented checking is
+sets the appropriate floating point exceptions. Extended checking is
 only needed to set errno (via __kernel_standard) if the input value
 is negative.
@@ -82,7 +82,7 @@ typedef unsigned long long int hp_timing_t;
 /* That's quite simple. Use the `mftb' instruction. Note that the value
 might not be 100% accurate since there might be some more instructions
 running in this moment. This could be changed by using a barrier like
-'lwsync' right before the `mftb' instruciton. But we are not interested
+'lwsync' right before the `mftb' instruction. But we are not interested
 in accurate clock cycles here so we don't do this. */

 #define HP_TIMING_NOW(Var) \
@@ -69,7 +69,7 @@ EALIGN (BP_SYM(memcmp), 4, 0)
 Otherwise we know the two strings have the same alignment (but not
 yet word aligned). So we force the string addresses to the next lower
 word boundary and special case this first word using shift left to
-eliminate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (word aligned) compare loop, starting at the second word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first word. This insures that the loop count is
@@ -517,7 +517,7 @@ L(zeroLength):
 Otherwise we know that rSTR1 is not aready word aligned yet.
 So we can force the string addresses to the next lower word
 boundary and special case this first word using shift left to
-eliminate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (Wualigned) compare loop, starting at the second word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first W. This insures that the loop count is
@@ -51,7 +51,7 @@ EALIGN (BP_SYM(strncmp), 4, 0)
 cmplwi cr1, rN, 0
 lis rFEFE, -0x101
 bne L(unaligned)
-/* We are word alligned so set up for two loops. first a word
+/* We are word aligned so set up for two loops. first a word
 loop, then fall into the byte loop if any residual. */
 srwi. rTMP, rN, 2
 clrlwi rN, rN, 30
@@ -22,7 +22,7 @@
 /* double [fp1] sqrt (double x [fp1])
 Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
 The fsqrt instruction generates the correct value for all inputs and
-sets the appropriate floating point exceptions. Extented checking is
+sets the appropriate floating point exceptions. Extended checking is
 only needed to set errno (via __kernel_standard) if the input value
 is negative.
@@ -22,7 +22,7 @@
 /* float [fp1] sqrts (float x [fp1])
 Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
 The fsqrts instruction generates the correct value for all inputs and
-sets the appropriate floating point exceptions. Extented checking is
+sets the appropriate floating point exceptions. Extended checking is
 only needed to set errno (via __kernel_standard) if the input value
 is negative.
@@ -411,31 +411,31 @@ L(wdu):
 not. For power4, power5 and power6 machines there is penalty for
 unaligned loads (src) that cross 32-byte, cacheline, or page
 boundaries. So we want to use simple (unaligned) loads where
-posible but avoid them where we know the load would span a 32-byte
+possible but avoid them where we know the load would span a 32-byte
 boundary.

 At this point we know we have at least 29 (32-3) bytes to copy
 the src is unaligned. and we may cross at least one 32-byte
-boundary. Also we have the following regester values:
+boundary. Also we have the following register values:
 r3 == adjusted dst, word aligned
 r4 == unadjusted src
 r5 == unadjusted len
 r9 == adjusted Word length
 r10 == src alignment (1-3)
-r12 == adjuested src, not aligned
+r12 == adjusted src, not aligned
 r31 == adjusted len

-First we need to copy word upto but not crossing the next 32-byte
+First we need to copy word up to but not crossing the next 32-byte
 boundary. Then perform aligned loads just before and just after
-the boundary and use shifts and or to gernerate the next aligned
+the boundary and use shifts and or to generate the next aligned
 word for dst. If more then 32 bytes remain we copy (unaligned src)
 the next 7 words and repeat the loop until less then 32-bytes
-remaim.
+remain.

 Then if more then 4 bytes remain we again use aligned loads,
 shifts and or to generate the next dst word. We then process the
 remaining words using unaligned loads as needed. Finally we check
-if there more then 0 bytes (1-3) bytes remainting and use
+if there more then 0 bytes (1-3) bytes remaining and use
 halfword and or byte load/stores to complete the copy.
 */
 mr 4,12 /* restore unaligned adjusted src ptr */
@@ -512,7 +512,7 @@ L(wdu_h32_4):
 addi 3,3,4
 .align 4
 L(wdu_h32_0):
-/* set up for 32-byte boundry crossing word move and possibly 32-byte
+/* set up for 32-byte boundary crossing word move and possibly 32-byte
 move loop. */
 clrrwi 12,4,2
 cmplwi cr5,31,32
@@ -44,7 +44,7 @@ L(proceed):
 rlwinm r6,r3,3,27,28 /* Calculate padding. */
 cmpli cr6,r6,0 /* cr6 == Do we have padding? */
 lwz r12,0(r8) /* Load word from memory. */
-cmpb r10,r12,r4 /* Check for BYTE's in WORD1. */
+cmpb r10,r12,r4 /* Check for BYTEs in WORD1. */
 beq cr6,L(proceed_no_padding)
 slw r10,r10,r6
 srw r10,r10,r6
@@ -73,7 +73,7 @@ EALIGN (BP_SYM(memcmp),4,0)
 Otherwise we know the two strings have the same alignment (but not
 yet word aligned). So we force the string addresses to the next lower
 word boundary and special case this first word using shift left to
-eliminate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (word aligned) compare loop, starting at the second word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first word. This insures that the loop count is
@@ -520,7 +520,7 @@ L(zeroLength):
 Otherwise we know that rSTR1 is not aready word aligned yet.
 So we can force the string addresses to the next lower word
 boundary and special case this first word using shift left to
-eliminate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (Wualigned) compare loop, starting at the second word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first W. This insures that the loop count is
@@ -51,7 +51,7 @@ L(proceed):
 cmpb r10,r12,r4 /* Check for BYTE in WORD1. */
 slw r10,r10,r0
 srw r10,r10,r0
-cmplwi cr7,r10,0 /* If r10 == 0, no BYTE's have been found. */
+cmplwi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
 bne cr7,L(done)

 /* Are we done already? */
@@ -39,8 +39,8 @@ ENTRY (BP_SYM (__STRCMP))
 #define rSTR1 r5 /* 1st string */
 #define rSTR2 r4 /* 2nd string */
 #define rLOCARG r5 /* 3rd argument: locale_t */
-#define rCHAR1 r6 /* Byte readed from 1st string */
-#define rCHAR2 r7 /* Byte readed from 2nd string */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
 #define rADDR1 r8 /* Address of tolower(rCHAR1) */
 #define rADDR2 r12 /* Address of tolower(rCHAR2) */
 #define rLWR1 r8 /* Byte tolower(rCHAR1) */
@@ -55,7 +55,7 @@ EALIGN (BP_SYM(strncmp),5,0)
 cmplwi cr1,rN,0
 lis rFEFE,-0x101
 bne L(unaligned)
-/* We are word alligned so set up for two loops. first a word
+/* We are word aligned so set up for two loops. first a word
 loop, then fall into the byte loop if any residual. */
 srwi. rTMP,rN,2
 clrlwi rN,rN,30
@@ -49,7 +49,7 @@ EALIGN (BP_SYM(strncmp), 4, 0)
 cmplwi cr1, rN, 0
 lis rFEFE, -0x101
 bne L(unaligned)
-/* We are word alligned so set up for two loops. first a word
+/* We are word aligned so set up for two loops. first a word
 loop, then fall into the byte loop if any residual. */
 srwi. rTMP, rN, 2
 clrlwi rN, rN, 30
@@ -21,7 +21,7 @@
 This is a hint to the hardware to expect additional updates adjacent
 to the lock word or not. If we are acquiring a Mutex, the hint
 should be true. Otherwise we releasing a Mutex or doing a simple
-atomic operation. In that case we don't expect addtional updates
+atomic operation. In that case we don't expect additional updates
 adjacent to the lock word after the Store Conditional and the hint
 should be false. */
@@ -34,7 +34,7 @@
 #endif

 /* The 32-bit exchange_bool is different on powerpc64 because the subf
-does signed 64-bit arthmatic while the lwarx is 32-bit unsigned
+does signed 64-bit arithmetic while the lwarx is 32-bit unsigned
 (a load word and zero (high 32) form) load.
 In powerpc64 register values are 64-bit by default, including oldval.
 The value in old val unknown sign extension, lwarx loads the 32-bit
@@ -34,7 +34,7 @@
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
-* For best performance instructionforms ending in "." like "andi."
+* For best performance instruction forms ending in "." like "andi."
 * should be avoided as the are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 */
@@ -146,7 +146,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 ld r9, 0x08(r4)
 dcbz r11,r6
 ld r7, 0x10(r4) /* 4 register stride copy is optimal */
-ld r8, 0x18(r4) /* to hide 1st level cache lantency. */
+ld r8, 0x18(r4) /* to hide 1st level cache latency. */
 ld r0, 0x20(r4)
 std r9, 0x08(r6)
 std r7, 0x10(r6)
@@ -202,7 +202,7 @@ BODY_PREFIX "_dl_start_user:\n" \
 " sldi 5,3,3\n" \
 " add 6,4,5\n" \
 " addi 5,6,8\n" \
-/* Pass the auxilary vector in r6. This is passed to us just after \
+/* Pass the auxiliary vector in r6. This is passed to us just after \
 _envp. */ \
 "2: ldu 0,8(6)\n" \
 " cmpdi 0,0\n" \
@@ -322,13 +322,13 @@ elf_machine_runtime_setup (struct link_map *map, int lazy, int profile)
 /* Relocate the DT_PPC64_GLINK entry in the _DYNAMIC section.
 elf_get_dynamic_info takes care of the standard entries but
 doesn't know exactly what to do with processor specific
-entires. */
+entries. */
 if (info[DT_PPC64(GLINK)] != NULL)
 info[DT_PPC64(GLINK)]->d_un.d_ptr += l_addr;

 if (lazy)
 {
-/* The function descriptor of the appropriate trampline
+/* The function descriptor of the appropriate trampoline
 routine is used to set the 1st and 2nd doubleword of the
 plt_reserve. */
 Elf64_FuncDesc *resolve_fd;
@@ -31,7 +31,7 @@
 PowerPC64 long double uses the IBM extended format which is
 represented two 64-floating point double values. The values are
 non-overlapping giving an effective precision of 106 bits. The first
-double contains the high order bits of mantisa and is always ceiled
+double contains the high order bits of mantissa and is always ceiled
 to represent a normal ceiling of long double to double. Since the
 long double value is sum of the high and low values, the low double
 normally has the opposite sign to compensate for the this ceiling.
@@ -40,7 +40,7 @@
 1) |x| < 2**52, all the integer bits are in the high double.
 ceil the high double and set the low double to -0.0.
 2) |x| >= 2**52, ceiling involves both doubles.
-See the comment before lable .L2 for details.
+See the comment before label .L2 for details.
 */

 ENTRY (__ceill)
@@ -26,16 +26,16 @@
 .section ".text"

 /* long double [fp1,fp2] nearbyintl (long double x [fp1,fp2])
-IEEE 1003.1 nearbyintl function. nearbyintl is simular to the rintl
+IEEE 1003.1 nearbyintl function. nearbyintl is similar to the rintl
 but does raise the "inexact" exception. This implementation is
-based on rintl but explicitly maskes the inexact exception on entry
+based on rintl but explicitly masks the inexact exception on entry
 and clears any pending inexact before restoring the exception mask
 on exit.

 PowerPC64 long double uses the IBM extended format which is
 represented two 64-floating point double values. The values are
 non-overlapping giving an effective precision of 106 bits. The first
-double contains the high order bits of mantisa and is always rounded
+double contains the high order bits of mantissa and is always rounded
 to represent a normal rounding of long double to double. Since the
 long double value is sum of the high and low values, the low double
 normally has the opposite sign to compensate for the this rounding.
@@ -44,7 +44,7 @@
 1) |x| < 2**52, all the integer bits are in the high double.
 floor the high double and set the low double to -0.0.
 2) |x| >= 2**52, Rounding involves both doubles.
-See the comment before lable .L2 for details.
+See the comment before label .L2 for details.
 */
 ENTRY (__nearbyintl)
 mffs fp11 /* Save current FPSCR. */
@@ -82,7 +82,7 @@ typedef unsigned long long int hp_timing_t;
 /* That's quite simple. Use the `mftb' instruction. Note that the value
 might not be 100% accurate since there might be some more instructions
 running in this moment. This could be changed by using a barrier like
-'lwsync' right before the `mftb' instruciton. But we are not interested
+'lwsync' right before the `mftb' instruction. But we are not interested
 in accurate clock cycles here so we don't do this. */
 #ifdef _ARCH_PWR4
 #define HP_TIMING_NOW(Var) __asm__ __volatile__ ("mfspr %0,268" : "=r" (Var))
@@ -28,11 +28,11 @@
 with the appropriate combination of byte and halfword load/stores.
 There is minimal effort to optimize the alignment of short moves.
 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
-of handling unligned load/stores that do not cross 32-byte boundries.
+of handling unaligned load/stores that do not cross 32-byte boundaries.

 Longer moves (>= 32-bytes) justify the effort to get at least the
 destination doubleword (8-byte) aligned. Further optimization is
-posible when both source and destination are doubleword aligned.
+possible when both source and destination are doubleword aligned.
 Each case has a optimized unrolled loop. */

 EALIGN (BP_SYM (memcpy), 5, 0)
@@ -43,9 +43,9 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 std 3,-16(1)
 std 31,-8(1)
 cfi_offset(31,-8)
-andi. 11,3,7 /* check alignement of dst. */
+andi. 11,3,7 /* check alignment of dst. */
 clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
-clrldi 10,4,61 /* check alignement of src. */
+clrldi 10,4,61 /* check alignment of src. */
 cmpldi cr6,5,8
 ble- cr1,.L2 /* If move < 32 bytes use short move code. */
 cmpld cr6,10,11
@@ -56,7 +56,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 beq .L0

 subf 31,0,5
-/* Move 0-7 bytes as needed to get the destination doubleword alligned. */
+/* Move 0-7 bytes as needed to get the destination doubleword aligned. */
 1: bf 31,2f
 lbz 6,0(12)
 addi 12,12,1
@@ -73,10 +73,10 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 stw 6,0(3)
 addi 3,3,4
 0:
-clrldi 10,12,61 /* check alignement of src again. */
+clrldi 10,12,61 /* check alignment of src again. */
 srdi 9,31,3 /* Number of full double words remaining. */

-/* Copy doublewords from source to destination, assumpting the
+/* Copy doublewords from source to destination, assuming the
 destination is aligned on a doubleword boundary.

 At this point we know there are at least 25 bytes left (32-7) to copy.
@@ -152,7 +152,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 add 12,12,0

 /* At this point we have a tail of 0-7 bytes and we know that the
-destiniation is double word aligned. */
+destination is double word aligned. */
 4: bf 29,2f
 lwz 6,0(12)
 addi 12,12,4
@@ -282,7 +282,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 bne cr6,4f
 /* Would have liked to use use ld/std here but the 630 processors are
 slow for load/store doubles that are not at least word aligned.
-Unaligned Load/Store word execute with only a 1 cycle penaltity. */
+Unaligned Load/Store word execute with only a 1 cycle penalty. */
 lwz 6,0(4)
 lwz 7,4(4)
 stw 6,0(3)
@@ -409,9 +409,9 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) {
 if (k > p2) {i1=k-p2; i2=p2+1; }
 else {i1=1; i2=k; }
 #if 1
-/* rearange this inner loop to allow the fmadd instructions to be
+/* rearrange this inner loop to allow the fmadd instructions to be
 independent and execute in parallel on processors that have
-dual symetrical FP pipelines. */
+dual symmetrical FP pipelines. */
 if (i1 < (i2-1))
 {
 /* make sure we have at least 2 iterations */
@@ -437,7 +437,7 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) {
 zk += x->d[i1]*y->d[i1];
 }
 #else
-/* The orginal code. */
+/* The original code. */
 for (i=i1,j=i2-1; i<i2; i++,j--) zk += X[i]*Y[j];
 #endif
@@ -59,7 +59,7 @@ __slowpow (double x, double y, double z)
 res1 = (double) (ldpp - ldeps);

 if (res != res1) /* if result still not accurate enough */
-{ /* use mpa for higher persision. */
+{ /* use mpa for higher precision. */
 mp_no mpx, mpy, mpz, mpw, mpp, mpr, mpr1;
 static const mp_no eps = { -3, {1.0, 4.0} };
 int p;
@@ -53,7 +53,7 @@ EALIGN (BP_SYM(memcmp), 4, 0)
 beq- cr6, L(zeroLength)
 dcbt 0,rSTR1
 dcbt 0,rSTR2
-/* If less than 8 bytes or not aligned, use the unalligned
+/* If less than 8 bytes or not aligned, use the unaligned
 byte loop. */
 blt cr1, L(bytealigned)
 std rWORD8,-8(r1)
@@ -62,7 +62,7 @@ EALIGN (BP_SYM(memcmp), 4, 0)
 cfi_offset(rWORD7,-16)
 bne L(unaligned)
 /* At this point we know both strings have the same alignment and the
-compare length is at least 8 bytes. rBITDIF containes the low order
+compare length is at least 8 bytes. rBITDIF contains the low order
 3 bits of rSTR1 and cr5 contains the result of the logical compare
 of rBITDIF to 0. If rBITDIF == 0 then we are already double word
 aligned and can perform the DWaligned loop.
@@ -70,7 +70,7 @@ EALIGN (BP_SYM(memcmp), 4, 0)
 Otherwise we know the two strings have the same alignment (but not
 yet DW). So we can force the string addresses to the next lower DW
 boundary and special case this first DW word using shift left to
-ellimiate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (DWaligned) compare loop, starting at the second double word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first DW. This insures that the loop count is
@@ -152,8 +152,8 @@ L(DWaligned):
 L(dP1):
 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
-(8-15 byte compare), we want to use only volitile registers. This
-means we can avoid restoring non-volitile registers since we did not
+(8-15 byte compare), we want to use only volatile registers. This
+means we can avoid restoring non-volatile registers since we did not
 change any on the early exit path. The key here is the non-early
 exit path only cares about the condition code (cr5), not about which
 register pair was used. */
@@ -215,7 +215,7 @@ L(dP2e):
 bne cr5, L(dLcr5)
 b L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
-only use volitile registers and avoid restoring non-volitile
+only use volatile registers and avoid restoring non-volatile
 registers. */
 .align 4
 L(dP2x):
@@ -256,7 +256,7 @@ L(dP3e):
 bne cr6, L(dLcr6)
 b L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
-only use volitile registers and avoid restoring non-volitile
+only use volatile registers and avoid restoring non-volatile
 registers. */
 .align 4
 L(dP3x):
@@ -340,7 +340,7 @@ L(d04):
 beq L(zeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
 we are aligned it is safe to load the whole double word, and use
-shift right double to elliminate bits beyond the compare length. */
+shift right double to eliminate bits beyond the compare length. */
 L(d00):
 ld rWORD1, 8(rSTR1)
 ld rWORD2, 8(rSTR2)
@@ -496,15 +496,15 @@ L(zeroLength):

 .align 4
 /* At this point we know the strings have different alignment and the
-compare length is at least 8 bytes. rBITDIF containes the low order
+compare length is at least 8 bytes. rBITDIF contains the low order
 3 bits of rSTR1 and cr5 contains the result of the logical compare
 of rBITDIF to 0. If rBITDIF == 0 then rStr1 is double word
 aligned and can perform the DWunaligned loop.

-Otherwise we know that rSTR1 is not aready DW aligned yet.
+Otherwise we know that rSTR1 is not already DW aligned yet.
 So we can force the string addresses to the next lower DW
 boundary and special case this first DW word using shift left to
-ellimiate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (DWaligned) compare loop, starting at the second double word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first DW. This insures that the loop count is
@@ -537,7 +537,7 @@ L(unaligned):
 clrrdi rSTR2, rSTR2, 3
 std r26,-48(r1)
 cfi_offset(r26,-48)
-/* Compute the leaft/right shift counts for the unalign rSTR2,
+/* Compute the left/right shift counts for the unalign rSTR2,
 compensating for the logical (DW aligned) start of rSTR1. */
 clrldi rSHL, r27, 61
 clrrdi rSTR1, rSTR1, 3
@@ -876,7 +876,7 @@ L(du14):
 sldi. rN, rN, 3
 bne cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
-shift right double to elliminate bits beyond the compare length.
+shift right double to eliminate bits beyond the compare length.
 This allows the use of double word subtract to compute the final
 result.
@@ -28,11 +28,11 @@
 with the appropriate combination of byte and halfword load/stores.
 There is minimal effort to optimize the alignment of short moves.
 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
-of handling unligned load/stores that do not cross 32-byte boundries.
+of handling unaligned load/stores that do not cross 32-byte boundaries.

 Longer moves (>= 32-bytes) justify the effort to get at least the
 destination doubleword (8-byte) aligned. Further optimization is
-posible when both source and destination are doubleword aligned.
+possible when both source and destination are doubleword aligned.
 Each case has a optimized unrolled loop. */

 .machine power4
@@ -44,9 +44,9 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 std 3,-16(1)
 std 31,-8(1)
 cfi_offset(31,-8)
-andi. 11,3,7 /* check alignement of dst. */
+andi. 11,3,7 /* check alignment of dst. */
 clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
-clrldi 10,4,61 /* check alignement of src. */
+clrldi 10,4,61 /* check alignment of src. */
 cmpldi cr6,5,8
 ble- cr1,.L2 /* If move < 32 bytes use short move code. */
 cmpld cr6,10,11
@@ -57,7 +57,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 beq .L0

 subf 31,0,5
-/* Move 0-7 bytes as needed to get the destination doubleword alligned. */
+/* Move 0-7 bytes as needed to get the destination doubleword aligned. */
 1: bf 31,2f
 lbz 6,0(12)
 addi 12,12,1
@@ -74,10 +74,10 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 stw 6,0(3)
 addi 3,3,4
 0:
-clrldi 10,12,61 /* check alignement of src again. */
+clrldi 10,12,61 /* check alignment of src again. */
 srdi 9,31,3 /* Number of full double words remaining. */

-/* Copy doublewords from source to destination, assumpting the
+/* Copy doublewords from source to destination, assuming the
 destination is aligned on a doubleword boundary.

 At this point we know there are at least 25 bytes left (32-7) to copy.
@@ -154,7 +154,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 add 12,12,0

 /* At this point we have a tail of 0-7 bytes and we know that the
-destiniation is double word aligned. */
+destination is double word aligned. */
 4: bf 29,2f
 lwz 6,0(12)
 addi 12,12,4
@@ -284,7 +284,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
 bne cr6,4f
 /* Would have liked to use use ld/std here but the 630 processors are
 slow for load/store doubles that are not at least word aligned.
-Unaligned Load/Store word execute with only a 1 cycle penaltity. */
+Unaligned Load/Store word execute with only a 1 cycle penalty. */
 lwz 6,0(4)
 lwz 7,4(4)
 stw 6,0(3)
@@ -52,7 +52,7 @@ EALIGN (BP_SYM(strncmp), 4, 0)
 cmpldi cr1, rN, 0
 lis rFEFE, -0x101
 bne L(unaligned)
-/* We are doubleword alligned so set up for two loops. first a double word
+/* We are doubleword aligned so set up for two loops. first a double word
 loop, then fall into the byte loop if any residual. */
 srdi. rTMP, rN, 3
 clrldi rN, rN, 61
@@ -28,16 +28,16 @@
 with the appropriate combination of byte and halfword load/stores.
 There is minimal effort to optimize the alignment of short moves.
 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
-of handling unligned load/stores that do not cross 32-byte boundries.
+of handling unaligned load/stores that do not cross 32-byte boundaries.

 Longer moves (>= 32-bytes) justify the effort to get at least the
 destination doubleword (8-byte) aligned. Further optimization is
-posible when both source and destination are doubleword aligned.
+possible when both source and destination are doubleword aligned.
 Each case has a optimized unrolled loop.

-For POWER6 unaligned loads will take a 20+ cycle hicup for any
+For POWER6 unaligned loads will take a 20+ cycle hiccup for any
 L1 cache miss that crosses a 32- or 128-byte boundary. Store
-is more forgiving and does not take a hicup until page or
+is more forgiving and does not take a hiccup until page or
 segment boundaries. So we require doubleword alignment for
 the source but may take a risk and only require word alignment
 for the destination. */
@@ -50,9 +50,9 @@ EALIGN (BP_SYM (memcpy), 7, 0)
 neg 0,3
 std 3,-16(1)
 std 31,-8(1)
-andi. 11,3,7 /* check alignement of dst. */
+andi. 11,3,7 /* check alignment of dst. */
 clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
-clrldi 10,4,61 /* check alignement of src. */
+clrldi 10,4,61 /* check alignment of src. */
 cmpldi cr6,5,8
 ble- cr1,.L2 /* If move < 32 bytes use short move code. */
 mtcrf 0x01,0
@@ -61,8 +61,8 @@ EALIGN (BP_SYM (memcpy), 7, 0)
 beq .L0

 subf 5,0,5
-/* Move 0-7 bytes as needed to get the destination doubleword alligned.
-Duplicate some code to maximize fall-throught and minimize agen delays. */
+/* Move 0-7 bytes as needed to get the destination doubleword aligned.
+Duplicate some code to maximize fall-through and minimize agen delays. */
 1: bf 31,2f
 lbz 6,0(4)
 stb 6,0(3)
@@ -95,10 +95,10 @@ EALIGN (BP_SYM (memcpy), 7, 0)
 add 4,4,0
 add 3,3,0

-clrldi 10,4,61 /* check alignement of src again. */
+clrldi 10,4,61 /* check alignment of src again. */
 srdi 9,5,3 /* Number of full double words remaining. */

-/* Copy doublewords from source to destination, assumpting the
+/* Copy doublewords from source to destination, assuming the
 destination is aligned on a doubleword boundary.

 At this point we know there are at least 25 bytes left (32-7) to copy.
@@ -130,7 +130,7 @@ EALIGN (BP_SYM (memcpy), 7, 0)
 load, load, store, store every 2 cycles.

 The following code is sensitive to cache line alignment. Do not
-make any change with out first making sure thay don't result in
+make any change with out first making sure they don't result in
 splitting ld/std pairs across a cache line. */

 mtcrf 0x02,5
@@ -329,7 +329,7 @@ L(das_tail):

 L(das_tail2):
 /* At this point we have a tail of 0-7 bytes and we know that the
-destiniation is double word aligned. */
+destination is double word aligned. */
 4: bf 29,2f
 lwz 6,0(4)
 stw 6,0(3)
@@ -537,7 +537,7 @@ L(dus_tailX):
 .LE8:
 mr 12,4
 bne cr6,L(dus_4)
-/* Exactly 8 bytes. We may cross a 32-/128-byte boundry and take a ~20
+/* Exactly 8 bytes. We may cross a 32-/128-byte boundary and take a ~20
 cycle delay. This case should be rare and any attempt to avoid this
 would take most of 20 cycles any way. */
 ld 6,0(4)
@@ -1146,7 +1146,7 @@ L(du_done):
 add 3,3,0
 add 12,12,0
 /* At this point we have a tail of 0-7 bytes and we know that the
-destiniation is double word aligned. */
+destination is double word aligned. */
 4: bf 29,2f
 lwz 6,0(12)
 addi 12,12,4
@@ -55,7 +55,7 @@ EALIGN (BP_SYM(memcmp),4,0)
 beq- cr6,L(zeroLength)
 dcbt 0,rSTR1
 dcbt 0,rSTR2
-/* If less than 8 bytes or not aligned, use the unalligned
+/* If less than 8 bytes or not aligned, use the unaligned
 byte loop. */
 blt cr1,L(bytealigned)
 std rWORD8,-8(r1)
@@ -64,7 +64,7 @@ EALIGN (BP_SYM(memcmp),4,0)
 cfi_offset(rWORD7,-16)
 bne L(unaligned)
 /* At this point we know both strings have the same alignment and the
-compare length is at least 8 bytes. rBITDIF containes the low order
+compare length is at least 8 bytes. rBITDIF contains the low order
 3 bits of rSTR1 and cr5 contains the result of the logical compare
 of rBITDIF to 0. If rBITDIF == 0 then we are already double word
 aligned and can perform the DWaligned loop.
@@ -72,7 +72,7 @@ EALIGN (BP_SYM(memcmp),4,0)
 Otherwise we know the two strings have the same alignment (but not
 yet DW). So we can force the string addresses to the next lower DW
 boundary and special case this first DW word using shift left to
-ellimiate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (DWaligned) compare loop, starting at the second double word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first DW. This insures that the loop count is
@@ -154,8 +154,8 @@ L(DWaligned):
 L(dP1):
 mtctr rTMP
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
-(8-15 byte compare), we want to use only volitile registers. This
-means we can avoid restoring non-volitile registers since we did not
+(8-15 byte compare), we want to use only volatile registers. This
+means we can avoid restoring non-volatile registers since we did not
 change any on the early exit path. The key here is the non-early
 exit path only cares about the condition code (cr5), not about which
 register pair was used. */
@@ -217,7 +217,7 @@ L(dP2e):
 bne cr5,L(dLcr5)
 b L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
-only use volitile registers and avoid restoring non-volitile
+only use volatile registers and avoid restoring non-volatile
 registers. */
 .align 4
 L(dP2x):
@@ -258,7 +258,7 @@ L(dP3e):
 bne cr6,L(dLcr6)
 b L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
-only use volitile registers and avoid restoring non-volitile
+only use volatile registers and avoid restoring non-volatile
 registers. */
 .align 4
 L(dP3x):
@@ -342,7 +342,7 @@ L(d04):
 beq L(zeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
 we are aligned it is safe to load the whole double word, and use
-shift right double to elliminate bits beyond the compare length. */
+shift right double to eliminate bits beyond the compare length. */
 L(d00):
 ld rWORD1,8(rSTR1)
 ld rWORD2,8(rSTR2)
@@ -498,15 +498,15 @@ L(zeroLength):

 .align 4
 /* At this point we know the strings have different alignment and the
-compare length is at least 8 bytes. rBITDIF containes the low order
+compare length is at least 8 bytes. rBITDIF contains the low order
 3 bits of rSTR1 and cr5 contains the result of the logical compare
 of rBITDIF to 0. If rBITDIF == 0 then rStr1 is double word
 aligned and can perform the DWunaligned loop.

-Otherwise we know that rSTR1 is not aready DW aligned yet.
+Otherwise we know that rSTR1 is not already DW aligned yet.
 So we can force the string addresses to the next lower DW
 boundary and special case this first DW word using shift left to
-ellimiate bits preceeding the first byte. Since we want to join the
+eliminate bits preceding the first byte. Since we want to join the
 normal (DWaligned) compare loop, starting at the second double word,
 we need to adjust the length (rN) and special case the loop
 versioning for the first DW. This insures that the loop count is
@@ -539,7 +539,7 @@ L(unaligned):
 clrrdi rSTR2,rSTR2,3
 std r26,-48(r1)
 cfi_offset(r26,-48)
-/* Compute the leaft/right shift counts for the unalign rSTR2,
+/* Compute the left/right shift counts for the unaligned rSTR2,
 compensating for the logical (DW aligned) start of rSTR1. */
 clrldi rSHL,r27,61
 clrrdi rSTR1,rSTR1,3
@@ -878,7 +878,7 @@ L(du14):
 sldi. rN,rN,3
 bne cr5,L(duLcr5)
 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
-shift right double to elliminate bits beyond the compare length.
+shift right double to eliminate bits beyond the compare length.
 This allows the use of double word subtract to compute the final
 result.
@@ -52,7 +52,7 @@ L(proceed):
 cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
 sld r10,r10,r0
 srd r10,r10,r0
-cmpldi cr7,r10,0 /* If r10 == 0, no BYTE's have been found. */
+cmpldi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
 bne cr7,L(done)

 /* Are we done already? */
@@ -40,8 +40,8 @@ ENTRY (BP_SYM (__STRCMP))
 #define rSTR1 r5 /* 1st string */
 #define rSTR2 r4 /* 2nd string */
 #define rLOCARG r5 /* 3rd argument: locale_t */
-#define rCHAR1 r6 /* Byte readed from 1st string */
-#define rCHAR2 r7 /* Byte readed from 2nd string */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
 #define rADDR1 r8 /* Address of tolower(rCHAR1) */
 #define rADDR2 r12 /* Address of tolower(rCHAR2) */
 #define rLWR1 r8 /* Word tolower(rCHAR1) */
@@ -56,7 +56,7 @@ EALIGN (BP_SYM(strncmp),5,0)
 cmpldi cr1,rN,0
 lis rFEFE,-0x101
 bne L(unaligned)
-/* We are doubleword alligned so set up for two loops. first a double word
+/* We are doubleword aligned so set up for two loops. first a double word
 loop, then fall into the byte loop if any residual. */
 srdi. rTMP,rN,3
 clrldi rN,rN,61
@@ -50,7 +50,7 @@ EALIGN (BP_SYM(strncmp), 4, 0)
 cmpldi cr1, rN, 0
 lis rFEFE, -0x101
 bne L(unaligned)
-/* We are doubleword alligned so set up for two loops. first a double word
+/* We are doubleword aligned so set up for two loops. first a double word
 loop, then fall into the byte loop if any residual. */
 srdi. rTMP, rN, 3
 clrldi rN, rN, 61