Remove ia64-linux-gnu

Linux 6.7 removed ia64 from the official tree [1].  Following the general
principle that a glibc port needs upstream support for the architecture
in all the components it depends on (binutils, GCC, and the Linux
kernel), the ia64 port is removed from glibc as well.

Apart from the removal of sysdeps/ia64 and sysdeps/unix/sysv/linux/ia64,
there are updates to various comments referencing ia64 for which removal
of those references seemed appropriate. The configuration is removed
from README and build-many-glibcs.py.

The CONTRIBUTED-BY, elf/elf.h, manual/contrib.texi (the porting
mention), *.po files, config.guess, and longlong.h are not changed.

For Linux, it also allows cleaning up some clone2 support in multiple files.

The following bugs can be closed as WONTFIX: BZ 22634 [2], BZ 14250 [3],
BZ 21634 [4], BZ 10163 [5], BZ 16401 [6], and BZ 11585 [7].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=43ff221426d33db909f7159fdf620c3b052e2d1c
[2] https://sourceware.org/bugzilla/show_bug.cgi?id=22634
[3] https://sourceware.org/bugzilla/show_bug.cgi?id=14250
[4] https://sourceware.org/bugzilla/show_bug.cgi?id=21634
[5] https://sourceware.org/bugzilla/show_bug.cgi?id=10163
[6] https://sourceware.org/bugzilla/show_bug.cgi?id=16401
[7] https://sourceware.org/bugzilla/show_bug.cgi?id=11585
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
This commit is contained in:
Adhemerval Zanella 2024-01-08 10:21:17 -03:00
parent e171ad7d59
commit 460860f457
567 changed files with 39 additions and 155609 deletions

View File

@ -609,9 +609,7 @@ Specific advice for GNU/Linux systems
If you are installing the GNU C Library on GNU/Linux systems, you need
to have the header files from a 3.2 or newer kernel around for
reference. (For the ia64 architecture, you need version 3.2.18 or newer
because this is the first version with support for the accept4 system
call.) These headers must be installed using make headers_install;
reference. These headers must be installed using make headers_install;
the headers present in the kernel source directory are not suitable for
direct use by the GNU C Library. You do not need to use that kernel,
just have its headers installed where the GNU C Library can access them,

2
NEWS
View File

@ -80,6 +80,8 @@ Deprecated and removed features, and other changes affecting compatibility:
of GNU libc are advised to check whether their build processes can be
simplified.
* The ia64*-*-linux-gnu configurations are no longer supported.
Changes to build and runtime requirements:
* Building on LoongArch requires at a minimum binutils 2.41 for vector

1
README
View File

@ -30,7 +30,6 @@ The GNU C Library supports these configurations for using Linux kernels:
hppa-*-linux-gnu
i[4567]86-*-linux-gnu
x86_64-*-linux-gnu Can build either x86_64 or x32
ia64-*-linux-gnu
loongarch64-*-linux-gnu Hardware floating point, LE only.
m68k-*-linux-gnu
microblaze*-*-linux-gnu

View File

@ -24,8 +24,7 @@ type mcontext_t
type ucontext_t
element ucontext_t {ucontext_t*} uc_link
// Bug 21634: uc_sigmask has wrong type.
xfail[ia64-linux]-element ucontext_t sigset_t uc_sigmask
element ucontext_t sigset_t uc_sigmask
element ucontext_t stack_t uc_stack
// Bug 21635: uc_mcontext has wrong type.
xfail[powerpc32-linux]-element ucontext_t mcontext_t uc_mcontext
@ -138,8 +137,7 @@ constant SIGSTKSZ
type ucontext_t
element ucontext_t {ucontext_t*} uc_link
// Bug 21634: uc_sigmask has wrong type.
xfail[ia64-linux]-element ucontext_t sigset_t uc_sigmask
element ucontext_t sigset_t uc_sigmask
element ucontext_t stack_t uc_stack
// Bug 21635: uc_mcontext has wrong type.
xfail[powerpc32-linux]-element ucontext_t mcontext_t uc_mcontext

View File

@ -4,8 +4,7 @@ type mcontext_t
type ucontext_t
element ucontext_t {ucontext_t*} uc_link
// Bug 21634: uc_sigmask has wrong type.
xfail[ia64-linux]-element ucontext_t sigset_t uc_sigmask
element ucontext_t sigset_t uc_sigmask
element ucontext_t stack_t uc_stack
// Bug 21635: uc_mcontext has wrong type.
xfail[powerpc32-linux]-element ucontext_t mcontext_t uc_mcontext

View File

@ -179,9 +179,6 @@ print_entry (const char *lib, int flag, uint64_t hwcap,
case FLAG_SPARC_LIB64:
fputs (",64bit", stdout);
break;
case FLAG_IA64_LIB64:
fputs (",IA-64", stdout);
break;
case FLAG_X8664_LIB64:
fputs (",x86-64", stdout);
break;

View File

@ -34,7 +34,6 @@ size_t taddr[] =
0x00010000 /* Linux elf32/sparc */
#if __WORDSIZE > 32
,
0x4000000000000000, /* Linux elf64/ia64 */
0x0000000120000000, /* Linux elf64/alpha */
0x4000000000001000, /* elf64/hppa */
0x0000000100000000 /* Linux elf64/sparc */

View File

@ -669,8 +669,6 @@ patches, although we try to avoid this.
If you are installing @theglibc{} on @gnulinuxsystems{}, you need to have
the header files from a 3.2 or newer kernel around for reference.
(For the ia64 architecture, you need version 3.2.18 or newer because this
is the first version with support for the @code{accept4} system call.)
These headers must be installed using @samp{make headers_install}; the
headers present in the kernel source directory are not suitable for
direct use by @theglibc{}. You do not need to use that kernel, just have

View File

@ -69,7 +69,7 @@ Support for @code{_Float@var{N}} or @code{_Float@var{N}x} types is
provided for @code{_Float32}, @code{_Float64} and @code{_Float32x} on
all platforms.
It is also provided for @code{_Float128} and @code{_Float64x} on
powerpc64le (PowerPC 64-bits little-endian), x86_64, x86, ia64,
powerpc64le (PowerPC 64-bits little-endian), x86_64, x86,
aarch64, alpha, loongarch, mips64, riscv, s390 and sparc.
@menu

View File

@ -128,7 +128,7 @@ extern const char doc[];
/* On some architectures, glibc can be built with compilers that do
not have suitable built-in functions for setting the payload of a
_Float128 NaN. */
#if ((defined __x86_64__ || defined __i386__ || defined __ia64__) \
#if ((defined __x86_64__ || defined __i386__) \
&& !__GNUC_PREREQ (7, 0))
# define XFAIL_FLOAT128_PAYLOAD (TEST_COND_binary128 ? XFAIL_TEST : 0)
#else

View File

@ -150,9 +150,7 @@ __attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
size_t pagesize_m1)
{
#ifdef NEED_SEPARATE_REGISTER_STACK
return mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
#if _STACK_GROWS_DOWN
return mem;
#elif _STACK_GROWS_UP
return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
@ -166,7 +164,7 @@ setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
const int prot)
{
char *guardend = guard + guardsize;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
#if _STACK_GROWS_DOWN
/* As defined at guard_position, for architectures with downward stack
the guard page is always at start of the allocated area. */
if (__mprotect (guardend, size - guardsize, prot) != 0)
@ -189,7 +187,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
{
uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
size_t pagesize_m1 = __getpagesize () - 1;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
#if _STACK_GROWS_DOWN
size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
assert (freesize < size);
if (freesize > PTHREAD_STACK_MIN)
@ -510,19 +508,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
{
/* The old guard area is too large. */
#ifdef NEED_SEPARATE_REGISTER_STACK
char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
if (oldguard < guard
&& __mprotect (oldguard, guard - oldguard, prot) != 0)
goto mprot_error;
if (__mprotect (guard + guardsize,
oldguard + pd->guardsize - guard - guardsize,
prot) != 0)
goto mprot_error;
#elif _STACK_GROWS_DOWN
#if _STACK_GROWS_DOWN
if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
prot) != 0)
goto mprot_error;
@ -599,7 +585,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
static void
name_stack_maps (struct pthread *pd, bool set)
{
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
#if _STACK_GROWS_DOWN
void *stack = pd->stackblock + pd->guardsize;
#else
void *stack = pd->stackblock;

View File

@ -708,8 +708,6 @@ clock_getcpuclockid (pid_t pid, clockid_t *clock_id)
({ unsigned int _hi, _lo; \
asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
(Var) = ((unsigned long long int) _hi << 32) | _lo; })
#elif defined __ia64__
#define HP_TIMING_NOW(Var) __asm__ __volatile__ ("mov %0=ar.itc" : "=r" (Var) : : "memory")
#else
#error "HP_TIMING_NOW missing"
#endif

View File

@ -234,10 +234,6 @@ class Context(object):
os_name='linux-gnu')
self.add_config(arch='i686',
os_name='gnu')
self.add_config(arch='ia64',
os_name='linux-gnu',
first_gcc_cfg=['--with-system-libunwind'],
binutils_cfg=['--enable-obsolete'])
self.add_config(arch='loongarch64',
os_name='linux-gnu',
variant='lp64d',
@ -1300,7 +1296,6 @@ def install_linux_headers(policy, cmdlist):
'i586': 'x86',
'i686': 'x86',
'i786': 'x86',
'ia64': 'ia64',
'loongarch64': 'loongarch',
'm68k': 'm68k',
'microblaze': 'microblaze',

View File

@ -27,18 +27,11 @@ xclone (int (*fn) (void *arg), void *arg, void *stack, size_t stack_size,
{
pid_t r = -1;
# ifdef __ia64__
extern int __clone2 (int (*fn) (void *arg), void *stack, size_t stack_size,
int flags, void *arg, ...);
r = __clone2 (fn, stack, stack_size, flags, arg, /* ptid */ NULL,
/* tls */ NULL, /* ctid */ NULL);
# else
# if _STACK_GROWS_DOWN
r = clone (fn, stack + stack_size, flags, arg, /* ptid */ NULL,
/* tls */ NULL, /* ctid */ NULL);
# elif _STACK_GROWS_UP
r = clone (fn, stack, flags, arg, /* ptid */ NULL, /* tls */ NULL, NULL);
# endif
# endif
if (r < 0)

View File

@ -30,7 +30,6 @@
#define FLAG_ELF_LIBC6 0x0003
#define FLAG_REQUIRED_MASK 0xff00
#define FLAG_SPARC_LIB64 0x0100
#define FLAG_IA64_LIB64 0x0200
#define FLAG_X8664_LIB64 0x0300
#define FLAG_S390_LIB64 0x0400
#define FLAG_POWERPC_LIB64 0x0500

View File

@ -224,7 +224,6 @@ _Unwind_FindEnclosingFunction (void *pc)
return NULL;
}
#ifndef __ia64__
_Unwind_Ptr
_Unwind_GetDataRelBase (struct _Unwind_Context *context)
{
@ -236,7 +235,6 @@ _Unwind_GetTextRelBase (struct _Unwind_Context *context)
{
return (_Unwind_Ptr) context->bases.tbase;
}
#endif
/* Extract any interesting information from the CIE for the translation
unit F belongs to. Return a pointer to the byte after the augmentation,

View File

@ -33,11 +33,7 @@ extern "C" {
inefficient for 32-bit and smaller machines. */
typedef unsigned _Unwind_Word __attribute__((__mode__(__unwind_word__)));
typedef signed _Unwind_Sword __attribute__((__mode__(__unwind_word__)));
#if defined(__ia64__) && defined(__hpux__)
typedef unsigned _Unwind_Ptr __attribute__((__mode__(__word__)));
#else
typedef unsigned _Unwind_Ptr __attribute__((__mode__(__pointer__)));
#endif
typedef unsigned _Unwind_Internal_Ptr __attribute__((__mode__(__pointer__)));
/* @@@ The IA-64 ABI uses a 64-bit word to identify the producer and
@ -190,29 +186,8 @@ extern void _Unwind_SjLj_Resume (struct _Unwind_Exception *);
and data-relative addressing in the LDSA. In order to stay link
compatible with the standard ABI for IA-64, we inline these. */
#ifdef __ia64__
#include <stdlib.h>
static inline _Unwind_Ptr
_Unwind_GetDataRelBase (struct _Unwind_Context *_C)
{
/* The GP is stored in R1. */
return _Unwind_GetGR (_C, 1);
}
static inline _Unwind_Ptr
_Unwind_GetTextRelBase (struct _Unwind_Context *_C)
{
abort ();
return 0;
}
/* @@@ Retrieve the Backing Store Pointer of the given context. */
extern _Unwind_Word _Unwind_GetBSP (struct _Unwind_Context *);
#else
extern _Unwind_Ptr _Unwind_GetDataRelBase (struct _Unwind_Context *);
extern _Unwind_Ptr _Unwind_GetTextRelBase (struct _Unwind_Context *);
#endif
/* @@@ Given an address, return the entry point of the function that
contains it. */

View File

@ -1,6 +0,0 @@
wordsize-64
# ia64 uses IEEE 754 floating point.
ieee754/float128
ieee754/ldbl-96
ieee754/dbl-64
ieee754/flt-32

View File

@ -1,4 +0,0 @@
# ia64 does not provide crtbeginT.o, so use crtbegin.o.
+prectorT = $(+prector)
float64x-alias-fcts = yes

View File

@ -1,25 +0,0 @@
# The ia64 `long double' is a distinct type we support.
long-double-fcts = yes
ifeq ($(subdir),math)
# sqrtf128 requires soft-fp.
CPPFLAGS += -I../soft-fp
endif
ifeq ($(subdir),gmon)
sysdep_routines += _mcount
endif
ifeq ($(subdir), csu)
CPPFLAGS-start.S = -D__ASSEMBLY__
ifeq (yes,$(build-shared))
# Compatibility
sysdep_routines += ia64libgcc
shared-only-routines += ia64libgcc
endif
endif
ifeq ($(subdir),elf)
sysdep-dl-routines += dl-symaddr dl-fptr
endif

View File

@ -1,21 +0,0 @@
ld {
GLIBC_PRIVATE {
# ia64 specific functions in the dynamic linker, but used by libc.so.
_dl_symbol_address; _dl_lookup_address;
_dl_function_address;
}
}
libc {
GLIBC_2.2 {
# Functions from libgcc.
__divtf3; __divdf3; __divsf3; __divdi3; __moddi3; __udivdi3; __umoddi3;
__multi3;
}
}
libm {
GLIBC_2.1 {
# A generic bug got this omitted from other configurations' version
# sets, but we always had it.
exp2l;
}
}

View File

@ -1,90 +0,0 @@
/* Machine-specific calling sequence for `mcount' profiling function. ia64
Copyright (C) 2000-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Assembly stub to invoke _mcount(). Compiler generated code calls
this stub before executing a function's prologue and without saving
any registers. It is therefore necessary to preserve the input
registers as they may contain function arguments. To work
correctly with frame-less functions, it is also necessary to
preserve the return pointer (b0 aka rp).
State upon entering _mcount:
r8 address of return value structure (used only when called
function returns a large structure)
r15 static link (used only for nested functions)
in0 ar.pfs to restore before returning to the function that
called _mcount
in1 gp value to restore before returning to the function that
called _mcount
in2 return address in the function that invoked the caller
of _mcount (frompc)
in3 address of the global-offset table entry that holds the
profile count dword allocated by the compiler; to get
the address of this dword, use "ld8 in2=[in2]"; this
dword can be used in any way by _mcount (including
not at all, as is the case with the current implementation)
b0 address to return to after _mcount is done
*/
#include <sysdep.h>
#undef ret
/* _mcount: register-preserving trampoline around __mcount.
   Stashes rp, the previous ar.pfs, r8 and r15 in stacked locals,
   calls __mcount with two arguments, then restores gp, ar.pfs, r8 and
   r15 and leaves through _mcount_ret_helper with b7, rp and r3 set up
   for it.  */
LEAF(_mcount)
.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
/* Frame layout: 4 inputs (in0-in3), 4 locals (loc0-loc3), 3 outputs.
   loc1 receives the previous ar.pfs.  */
alloc loc1 = ar.pfs, 4, 4, 3, 0
mov loc0 = rp /* loc0 = b0 on entry, i.e. where to resume after _mcount */
.body
mov loc2 = r8 // gcc uses r8 to pass pointer to return structure
;;
mov loc3 = r15 // gcc uses r15 to pass the static link to nested functions
mov out0 = in2 /* first argument: in2 (frompc) */
mov out1 = rp /* second argument: b0 on entry to this stub */
br.call.sptk.few rp = __mcount
;;
.here:
{
.mii
mov gp = in1 /* restore the gp value handed in by the caller */
mov r2 = ip /* r2 = address of this bundle, used as base below */
mov ar.pfs = loc1 /* put back the ar.pfs captured by alloc */
}
;;
/* r2 = run-time address of _mcount_ret_helper, computed relative to
   .here so no GOT access is needed.  */
adds r2 = _mcount_ret_helper - .here, r2
mov b7 = loc0 /* helper branches to b7: original return address */
mov rp = in2 /* hand the profiled function its own return pointer */
;;
mov r3 = in0 /* helper reloads ar.pfs from r3 */
mov r8 = loc2
mov r15 = loc3
mov b6 = r2
/* Return-type branch to the helper rather than to the caller.
   NOTE(review): presumably used so that this stub's register frame is
   popped before the helper runs; confirm against the IA-64 manual.  */
br.ret.sptk.few b6
END(_mcount)
/* _mcount_ret_helper: second half of the _mcount return path.
   Entered from _mcount with b7 = address to continue at in the
   profiled function and r3 = ar.pfs value to reinstate.  */
LOCAL_LEAF(_mcount_ret_helper)
.prologue
.altrp b7 /* unwind info: the return pointer lives in b7 here */
.save ar.pfs, r3 /* unwind info: the saved ar.pfs lives in r3 */
.body
/* Allocate a frame with no inputs or locals and 8 outputs; the old
   ar.pfs written to r2 is not used afterwards.
   NOTE(review): presumably the 8 outputs keep the profiled function's
   incoming argument registers addressable; confirm.  */
alloc r2 = ar.pfs, 0, 0, 8, 0
mov ar.pfs = r3 /* reinstate the ar.pfs passed in by _mcount */
br b7 /* continue in the profiled function */
END(_mcount_ret_helper)
weak_alias (_mcount, mcount)

View File

@ -1,3 +0,0 @@
/* An instruction which should crash any program is `break 0' which triggers
SIGILL. */
#define ABORT_INSTRUCTION asm ("break 0")

View File

@ -1,96 +0,0 @@
/* Copyright (C) 2003-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <ia64intrin.h>
#define __HAVE_64B_ATOMICS 1
#define USE_ATOMIC_COMPILER_BUILTINS 0
/* XXX Is this actually correct? */
#define ATOMIC_EXCHANGE_USES_CAS 0
/* Compare-and-exchange returning a status: each 32- and 64-bit variant
   evaluates to zero when *MEM equalled OLDVAL and NEWVAL was stored, and
   to nonzero otherwise (note the negation of the builtin's result).
   The macro takes (mem, newval, oldval) while the builtin takes
   (ptr, oldval, newval), hence the swapped argument order below.
   The 8- and 16-bit variants are not implemented and call abort ().  */
#define __arch_compare_and_exchange_bool_8_acq(mem, newval, oldval) \
(abort (), 0)
#define __arch_compare_and_exchange_bool_16_acq(mem, newval, oldval) \
(abort (), 0)
#define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \
(!__sync_bool_compare_and_swap ((mem), (int) (long) (oldval), \
(int) (long) (newval)))
#define __arch_compare_and_exchange_bool_64_acq(mem, newval, oldval) \
(!__sync_bool_compare_and_swap ((mem), (long) (oldval), \
(long) (newval)))
/* Compare-and-exchange returning a value: the 32- and 64-bit variants
   evaluate to whatever __sync_val_compare_and_swap returns, i.e. the
   contents of *MEM before the operation.  Arguments are reordered from
   (mem, newval, oldval) to the builtin's (ptr, oldval, newval).
   The 8- and 16-bit variants are not implemented and call abort ().  */
#define __arch_compare_and_exchange_val_8_acq(mem, newval, oldval) \
(abort (), (__typeof (*mem)) 0)
#define __arch_compare_and_exchange_val_16_acq(mem, newval, oldval) \
(abort (), (__typeof (*mem)) 0)
#define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
__sync_val_compare_and_swap ((mem), (int) (long) (oldval), \
(int) (long) (newval))
#define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
__sync_val_compare_and_swap ((mem), (long) (oldval), (long) (newval))
/* Atomically store VALUE in *MEM and return the old value.  The _rel
   variant issues __sync_synchronize () before the exchange so that
   earlier accesses are ordered ahead of it.  */
#define atomic_exchange_acq(mem, value) \
__sync_lock_test_and_set (mem, value)
#define atomic_exchange_rel(mem, value) \
(__sync_synchronize (), __sync_lock_test_and_set (mem, value))
/* Atomically add VALUE to *MEM; evaluates to the result of
   __sync_fetch_and_add, i.e. the value *MEM held before the addition.  */
#define atomic_exchange_and_add(mem, value) \
__sync_fetch_and_add ((mem), (value))
/* Decrement *MEM by one unless its value is <= 0, retrying the
   compare-and-exchange until it succeeds or the value is seen to be
   non-positive.  Evaluates to the value observed before the (possible)
   decrement.  MEM is evaluated once.  Relies on
   atomic_compare_and_exchange_val_acq, which is not defined in this
   file.  */
#define atomic_decrement_if_positive(mem) \
({ __typeof (*mem) __oldval, __val; \
__typeof (mem) __memp = (mem); \
\
__val = (*__memp); \
do \
{ \
__oldval = __val; \
if (__builtin_expect (__val <= 0, 0)) \
break; \
__val = atomic_compare_and_exchange_val_acq (__memp, __oldval - 1, \
__oldval); \
} \
while (__builtin_expect (__val != __oldval, 0)); \
__oldval; })
/* Atomically set bit number BIT in *MEM using a compare-and-exchange
   retry loop.  Evaluates to the previous state of that bit, masked in
   place (nonzero if it was already set, zero otherwise).  MEM is
   evaluated once.  Relies on atomic_compare_and_exchange_val_acq, which
   is not defined in this file.  */
#define atomic_bit_test_set(mem, bit) \
({ __typeof (*mem) __oldval, __val; \
__typeof (mem) __memp = (mem); \
__typeof (*mem) __mask = ((__typeof (*mem)) 1 << (bit)); \
\
__val = (*__memp); \
do \
{ \
__oldval = __val; \
__val = atomic_compare_and_exchange_val_acq (__memp, \
__oldval | __mask, \
__oldval); \
} \
while (__builtin_expect (__val != __oldval, 0)); \
__oldval & __mask; })
/* Full memory barrier, delegated to the compiler builtin.  */
#define atomic_full_barrier() __sync_synchronize ()

View File

@ -1,11 +0,0 @@
#ifndef _BITS_ENDIANNESS_H
#define _BITS_ENDIANNESS_H 1
#ifndef _BITS_ENDIAN_H
# error "Never use <bits/endianness.h> directly; include <endian.h> instead."
#endif
/* IA64 is little-endian. */
#define __BYTE_ORDER __LITTLE_ENDIAN
#endif /* bits/endianness.h */

View File

@ -1,104 +0,0 @@
/* Copyright (C) 1999-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _FENV_H
# error "Never use <bits/fenv.h> directly; include <fenv.h> instead."
#endif
/* Define bits representing the exception. We use the bit positions of
the appropriate bits in the FPSR... (Tahoe EAS 2.4 5-4)*/
enum
{
FE_INEXACT =
#define FE_INEXACT (1 << 5)
FE_INEXACT,
FE_UNDERFLOW =
#define FE_UNDERFLOW (1 << 4)
FE_UNDERFLOW,
FE_OVERFLOW =
#define FE_OVERFLOW (1 << 3)
FE_OVERFLOW,
FE_DIVBYZERO =
#define FE_DIVBYZERO (1 << 2)
FE_DIVBYZERO,
FE_UNNORMAL =
#define FE_UNNORMAL (1 << 1)
FE_UNNORMAL,
FE_INVALID =
#define FE_INVALID (1 << 0)
FE_INVALID,
FE_ALL_EXCEPT =
#define FE_ALL_EXCEPT (FE_INEXACT | FE_UNDERFLOW | FE_OVERFLOW | FE_DIVBYZERO | FE_UNNORMAL | FE_INVALID)
FE_ALL_EXCEPT
};
enum
{
FE_TOWARDZERO =
#define FE_TOWARDZERO 3
FE_TOWARDZERO,
FE_UPWARD =
#define FE_UPWARD 2
FE_UPWARD,
FE_DOWNWARD =
#define FE_DOWNWARD 1
FE_DOWNWARD,
FE_TONEAREST =
#define FE_TONEAREST 0
FE_TONEAREST,
};
/* Type representing exception flags. */
typedef unsigned long int fexcept_t;
/* Type representing floating-point environment. */
typedef unsigned long int fenv_t;
/* If the default argument is used we use this value. */
#define FE_DFL_ENV ((const fenv_t *) 0xc009804c0270033fUL)
#ifdef __USE_GNU
/* Floating-point environment where only FE_UNNORMAL is masked since this
exception is not generally supported by glibc. */
# define FE_NOMASK_ENV ((const fenv_t *) 0xc009804c02700302UL)
/* Floating-point environment with (processor-dependent) non-IEEE
floating point. In this case, turning on flush-to-zero mode for
s0, s2, and s3. */
# define FE_NONIEEE_ENV ((const fenv_t *) 0xc009a04d0270037fUL)
#endif
#if __GLIBC_USE (IEC_60559_BFP_EXT_C2X)
/* Type representing floating-point control modes. */
typedef unsigned long int femode_t;
/* Default floating-point control modes. */
# define FE_DFL_MODE ((const femode_t *) 0xc009804c0270033fUL)
#endif

View File

@ -1,119 +0,0 @@
/* Macros to control TS 18661-3 glibc features on ia64.
Copyright (C) 2017-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _BITS_FLOATN_H
#define _BITS_FLOATN_H
#include <features.h>
/* Defined to 1 if the current compiler invocation provides a
floating-point type with the IEEE 754 binary128 format, and this
glibc includes corresponding *f128 interfaces for it. The required
libgcc support was added some time after the basic compiler
support. */
#if __GNUC_PREREQ (4, 4)
# define __HAVE_FLOAT128 1
#else
# define __HAVE_FLOAT128 0
#endif
/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
from the default float, double and long double types in this glibc. */
#if __HAVE_FLOAT128
# define __HAVE_DISTINCT_FLOAT128 1
#else
# define __HAVE_DISTINCT_FLOAT128 0
#endif
/* Defined to 1 if the current compiler invocation provides a
floating-point type with the right format for _Float64x, and this
glibc includes corresponding *f64x interfaces for it. */
#define __HAVE_FLOAT64X 1
/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
of long double. Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
the format of _Float128, which must be different from that of long
double. */
#define __HAVE_FLOAT64X_LONG_DOUBLE 1
#ifndef __ASSEMBLER__
/* Defined to concatenate the literal suffix to be used with _Float128
types, if __HAVE_FLOAT128 is 1. */
# if __HAVE_FLOAT128
# if !__GNUC_PREREQ (7, 0) || (defined __cplusplus && !__GNUC_PREREQ (13, 0))
/* The literal suffix f128 exists only since GCC 7.0. */
# define __f128(x) x##q
# else
# define __f128(x) x##f128
# endif
# endif
/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1. */
# if __HAVE_FLOAT128
# if !__GNUC_PREREQ (7, 0) || (defined __cplusplus && !__GNUC_PREREQ (13, 0))
/* Add a typedef for older GCC compilers which don't natively support
_Complex _Float128. */
typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__)));
# define __CFLOAT128 __cfloat128
# else
# define __CFLOAT128 _Complex _Float128
# endif
# endif
/* The remainder of this file provides support for older compilers. */
# if __HAVE_FLOAT128
/* The type _Float128 exists only since GCC 7.0. */
# if !__GNUC_PREREQ (7, 0) || (defined __cplusplus && !__GNUC_PREREQ (13, 0))
typedef __float128 _Float128;
# endif
/* __builtin_huge_valf128 doesn't exist before GCC 7.0. */
# if !__GNUC_PREREQ (7, 0)
# define __builtin_huge_valf128() ((_Float128) __builtin_huge_val ())
# endif
/* Older GCC has only a subset of built-in functions for _Float128 on
ia64, and __builtin_infq is not usable in static initializers.
Converting a narrower sNaN to _Float128 produces a quiet NaN, so
attempts to use _Float128 sNaNs will not work properly with older
compilers. */
# if !__GNUC_PREREQ (7, 0)
# define __builtin_copysignf128 __builtin_copysignq
# define __builtin_fabsf128 __builtin_fabsq
# define __builtin_inff128() ((_Float128) __builtin_inf ())
# define __builtin_nanf128(x) ((_Float128) __builtin_nan (x))
# define __builtin_nansf128(x) ((_Float128) __builtin_nans (x))
# endif
/* In math/math.h, __MATH_TG will expand signbit to __builtin_signbit*,
e.g.: __builtin_signbitf128, before GCC 6. However, there has never
been a __builtin_signbitf128 in GCC and the type-generic builtin is
only available since GCC 6. */
# if !__GNUC_PREREQ (6, 0)
# define __builtin_signbitf128 __signbitf128
# endif
# endif
#endif /* !__ASSEMBLER__. */
#include <bits/floatn-common.h>
#endif /* _BITS_FLOATN_H */

View File

@ -1,24 +0,0 @@
/* Define __FP_LOGB0_IS_MIN and __FP_LOGBNAN_IS_MIN. IA64 version.
Copyright (C) 2016-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _MATH_H
# error "Never use <bits/fp-logb.h> directly; include <math.h> instead."
#endif
#define __FP_LOGB0_IS_MIN 1
#define __FP_LOGBNAN_IS_MIN 0

View File

@ -1,62 +0,0 @@
/* Copyright (C) 2005-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _LINK_H
# error "Never include <bits/link.h> directly; use <link.h> instead."
#endif
/* Registers for entry into PLT on ia64. */
typedef struct La_ia64_regs
{
uint64_t lr_r8;
uint64_t lr_r9;
uint64_t lr_r10;
uint64_t lr_r11;
uint64_t lr_gr [8];
long double lr_fr [8];
uint64_t lr_unat;
uint64_t lr_sp;
} La_ia64_regs;
/* Return values for calls from PLT on ia64. */
typedef struct La_ia64_retval
{
uint64_t lrv_r8;
uint64_t lrv_r9;
uint64_t lrv_r10;
uint64_t lrv_r11;
long double lr_fr [8];
} La_ia64_retval;
__BEGIN_DECLS
extern Elf64_Addr la_ia64_gnu_pltenter (Elf64_Sym *__sym, unsigned int __ndx,
uintptr_t *__refcook,
uintptr_t *__defcook,
La_ia64_regs *__regs,
unsigned int *__flags,
const char *__symname,
long int *__framesizep);
extern unsigned int la_ia64_gnu_pltexit (Elf64_Sym *__sym, unsigned int __ndx,
uintptr_t *__refcook,
uintptr_t *__defcook,
const La_ia64_regs *__inregs,
La_ia64_retval *__outregs,
const char *__symname);
__END_DECLS

View File

@ -1,3 +0,0 @@
/* ia64 does not export __bzero symbol. */
#define __bzero bzero
#include <string/bzero.c>

View File

@ -1,9 +0,0 @@
# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
# Local configure fragment for sysdeps/ia64.
# PIE builds fail on binutils 2.37 and earlier, see:
# https://sourceware.org/bugzilla/show_bug.cgi?id=28672
printf "%s\n" "#define PIE_UNSUPPORTED 1" >>confdefs.h
# work around problem with autoconf and empty lines at the end of files

View File

@ -1,7 +0,0 @@
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
# Local configure fragment for sysdeps/ia64.
# PIE builds fail on binutils 2.37 and earlier, see:
# https://sourceware.org/bugzilla/show_bug.cgi?id=28672
AC_DEFINE(PIE_UNSUPPORTED)
# work around problem with autoconf and empty lines at the end of files

View File

@ -1,162 +0,0 @@
/* Special .init and .fini section support for IA64.
Copyright (C) 2000-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
In addition to the permissions in the GNU Lesser General Public
License, the Free Software Foundation gives you unlimited
permission to link the compiled version of this file with other
programs, and to distribute those programs without any restriction
coming from the use of this file. (The GNU Lesser General Public
License restrictions do apply in other respects; for example, they
cover modification of the file, and distribution when not linked
into another program.)
Note that people who make modified versions of this file are not
obligated to grant this special exception for their modified
versions; it is their choice whether to do so. The GNU Lesser
General Public License gives permission to release a modified
version without this exception; this exception also makes it
possible to release a modified version which carries forward this
exception.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
/* crti.S puts a function prologue at the beginning of the .init and
.fini sections and defines global symbols for those addresses, so
they can be called as functions. The symbols _init and _fini are
magic and cause the linker to emit DT_INIT and DT_FINI. */
#include <libc-symbols.h>
#include <sysdep.h>
/* Drop any `ret' macro so the bare mnemonic can be used below
   (presumably defined by sysdep.h -- confirm).  */
#undef ret

/* Function to run before the other initializers; overridable by the
   including file.  Defaults to a weak reference to __gmon_start__.  */
#if !defined PREINIT_FUNCTION
# define PREINIT_FUNCTION __gmon_start__
#endif

#if !defined PREINIT_FUNCTION_WEAK
# define PREINIT_FUNCTION_WEAK 1
#endif

/* A non-weak pre-init function is referenced as a hidden symbol; a weak
   one is declared through weak_extern.  */
#if !PREINIT_FUNCTION_WEAK
.hidden PREINIT_FUNCTION
#else
weak_extern (PREINIT_FUNCTION)
#endif
/* If we have working .init_array support, we want to keep the .init
section empty (apart from the mandatory prologue/epilogue). This
ensures that the default unwind conventions (return-pointer in b0,
frame state in ar.pfs, etc.) will do the Right Thing. To ensure
an empty .init section, we register gmon_initializer() via the
.init_array.
--davidm 02/10/29 */
#if PREINIT_FUNCTION_WEAK
/* This blob of assembly code is one simple C function:
static void
__attribute__ ((used))
gmon_initializer (void)
{
extern void weak_function __gmon_start__ (void);
if (__gmon_start__)
(*__gmon_start__)();
}
*/
/* Register use below: r32 holds the saved return pointer (b0), r33 the
saved ar.pfs and r34 the saved r1, which is restored after the call.
r14 is scratch: first the address of the linkage-table slot for the
function descriptor of PREINIT_FUNCTION, then the value loaded from it. */
.text
.align 64
.proc gmon_initializer#
gmon_initializer:
.prologue 12, 32
.mmi
.save ar.pfs, r33
alloc r33 = ar.pfs, 0, 3, 0, 0
addl r14 = @ltoff(@fptr(PREINIT_FUNCTION#)), gp
.save rp, r32
mov r32 = b0
.mmi
mov r34 = r1
.body
;;
ld8 r14 = [r14]
nop 0
;;
.mib
/* p6 is set when the loaded descriptor pointer is zero, i.e. the weak
symbol is undefined; in that case skip the call. */
cmp.eq p6, p7 = 0, r14
nop 0
(p6) br.cond.spnt .L1
;;
.mib
nop 0
nop 0
br.call.sptk.many b0 = PREINIT_FUNCTION#
;;
.mmi
/* Restore r1 saved before the call. */
mov r1 = r34
nop 0
nop 0
.L1:
.mii
nop 0
mov ar.pfs = r33
nop 0
;;
.mib
nop 0
mov b0 = r32
br.ret.sptk.many b0
.endp gmon_initializer#
/* From here on the function registered in .init_array is the wrapper
above rather than the weak symbol itself. */
# undef PREINIT_FUNCTION
# define PREINIT_FUNCTION gmon_initializer
#endif
/* Register PREINIT_FUNCTION by placing a pointer to its function
descriptor in the .init_array section. */
.section .init_array, "aw"
data8 @fptr(PREINIT_FUNCTION)
/* Prologue of _init, placed at the start of the .init section. It
allocates three locals, saves the incoming stack pointer in r32, the
return pointer in r33 and ar.pfs in r34, and reserves 16 bytes of
stack. The procedure is left open here: there is no return in this
fragment. */
.section .init,"ax",@progbits
.global _init#
.hidden _init#
.proc _init#
_init:
.prologue
.save ar.pfs, r34
alloc r34 = ar.pfs, 0, 3, 0, 0
.vframe r32
mov r32 = r12
.save rp, r33
mov r33 = b0
.body
adds r12 = -16, r12
;; /* see gmon_initializer() above */
.endp _init#
/* Prologue of _fini, placed at the start of the .fini section. Same
register assignment as the _init prologue: r32 = incoming sp, r33 =
return pointer, r34 = ar.pfs; 16 bytes of stack are reserved. There
is no return in this fragment. */
.section .fini,"ax",@progbits
.global _fini#
.hidden _fini#
.proc _fini#
_fini:
.prologue
.save ar.pfs, r34
alloc r34 = ar.pfs, 0, 3, 0, 0
.vframe r32
mov r32 = r12
.save rp, r33
mov r33 = b0
.body
adds r12 = -16, r12
;;
.endp _fini#

View File

@ -1,69 +0,0 @@
/* Special .init and .fini section support for IA-64.
Copyright (C) 2000-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
In addition to the permissions in the GNU Lesser General Public
License, the Free Software Foundation gives you unlimited
permission to link the compiled version of this file with other
programs, and to distribute those programs without any restriction
coming from the use of this file. (The GNU Lesser General Public
License restrictions do apply in other respects; for example, they
cover modification of the file, and distribution when not linked
into another program.)
Note that people who make modified versions of this file are not
obligated to grant this special exception for their modified
versions; it is their choice whether to do so. The GNU Lesser
General Public License gives permission to release a modified
version without this exception; this exception also makes it
possible to release a modified version which carries forward this
exception.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#undef ret
/* crtn.S puts function epilogues in the .init and .fini sections
corresponding to the prologues in crti.S. */
/* Epilogue of _init: restore the stack pointer from r32, ar.pfs from
r34 and the return pointer from r33, then return. The unwind
directives restate where those values are kept.
NOTE(review): .regstk declares 2 locals, while this code reads r34 and
the _fini epilogue in this file has no .regstk at all -- confirm this
asymmetry is intentional. */
.section .init,"ax",@progbits
.proc _init#
_init:
.prologue
.save ar.pfs, r34
.vframe r32
.save rp, r33
.body
.regstk 0,2,0,0
mov r12 = r32
mov ar.pfs = r34
mov b0 = r33
br.ret.sptk.many b0
.endp _init#
/* Epilogue of _fini: restore the stack pointer from r32, ar.pfs from
r34 and the return pointer from r33, then return. */
.section .fini,"ax",@progbits
.proc _fini#
_fini:
.prologue
.save ar.pfs, r34
.vframe r32
.save rp, r33
.body
mov r12 = r32
mov ar.pfs = r34
mov b0 = r33
br.ret.sptk.many b0
.endp _fini#

View File

@ -1,21 +0,0 @@
/* Configuration of lookup functions. IA-64 version.
Copyright (C) 2000-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Number of extra dynamic section entries for this architecture.
For IA-64 this is DT_IA_64_NUM. */
#define DT_THISPROCNUM DT_IA_64_NUM

View File

@ -1,45 +0,0 @@
/* Function descriptors. IA64 version.
Copyright (C) 2003-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef dl_ia64_fptr_h
#define dl_ia64_fptr_h 1
#include <ia64intrin.h>
#include <sysdeps/generic/dl-fptr.h>
/* Atomic compare-and-swap: store NEW in *PTR if it currently holds OLD.
Evaluates to the boolean result of __sync_bool_compare_and_swap. */
#define COMPARE_AND_SWAP(ptr, old, new) \
__sync_bool_compare_and_swap (ptr, old, new)
/* There are currently 123 dynamic symbols in ld.so.
ELF_MACHINE_BOOT_FPTR_TABLE_LEN needs to be at least that big. */
#define ELF_MACHINE_BOOT_FPTR_TABLE_LEN 200
/* Set VAR to the gp-relative offset of SYMBOL plus gp.
NOTE(review): the expansion already ends in a semicolon, so a use
followed by `;' yields an empty statement -- check call sites before
using it as the sole body of an unbraced if/else. */
#define ELF_MACHINE_LOAD_ADDRESS(var, symbol) \
asm ("movl %0 = @gprel (" #symbol ");; add %0 = %0, gp" : "=&r" (var));
/* We don't have a gcc helper to extract the plabel info. */
/* Reinterpret PTR as a pointer to struct fdesc by going through a
union; the pointer value itself is not changed. */
#define ELF_PTR_TO_FDESC(ptr) \
({ union { \
void *_ptr; \
struct fdesc *_fdesc; \
} _u; \
_u._ptr = ptr; \
_u._fdesc; \
})
#endif /* !dl_ia64_fptr_h */

View File

@ -1,79 +0,0 @@
/* Configuration of lookup functions.
Copyright (C) 2000-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Feature flags: function pointers and unmapping get IA-64 specific
treatment (presumably tested by generic dynamic-linker code --
confirm against the users of these macros). */
#define ELF_FUNCTION_PTR_IS_SPECIAL
#define DL_UNMAP_IS_SPECIAL
#include <dl-fptr.h>
/* We do not support copy relocations for IA-64. */
#define DL_NO_COPY_RELOCS
/* Forward declaration. */
struct link_map;
/* The three hooks below route symbol-address computation, address
lookup and object unmapping to out-of-line helpers. */
extern void *_dl_symbol_address (struct link_map *map, const Elf64_Sym *ref);
rtld_hidden_proto (_dl_symbol_address)
#define DL_SYMBOL_ADDRESS(map, ref) _dl_symbol_address(map, ref)
extern Elf64_Addr _dl_lookup_address (const void *address);
#define DL_LOOKUP_ADDRESS(addr) _dl_lookup_address (addr)
extern void attribute_hidden _dl_unmap (struct link_map *map);
#define DL_UNMAP(map) _dl_unmap (map)
/* Build a two-word function descriptor for the code address START in
   object MAP and store the descriptor's address in ADDR.

   The expansion declares a local array named `fptr' (with optional
   storage-class/qualifier prefix ATTR, which may be empty) holding
   { START, value of MAP's DT_PLTGOT entry }, so it must be used at
   most once per scope and is written without a trailing semicolon.

   The final line deliberately has no line-continuation backslash:
   with one, the macro body would run on into whatever line follows
   the definition.  */
#define DL_DT_FUNCTION_ADDRESS(map, start, attr, addr)			\
  attr volatile unsigned long int fptr[2];				\
  fptr[0] = (unsigned long int) (start);				\
  fptr[1] = (map)->l_info[DT_PLTGOT]->d_un.d_ptr;			\
  addr = (ElfW(Addr)) fptr;
/* Call the DT_INIT function of MAP located at code address START with
(ARGC, ARGV, ENV). A function descriptor is built first via
DL_DT_FUNCTION_ADDRESS and the call goes through it. */
#define DL_CALL_DT_INIT(map, start, argc, argv, env) \
{ \
ElfW(Addr) addr; \
DL_DT_FUNCTION_ADDRESS(map, start, , addr) \
dl_init_t init = (dl_init_t) addr; \
init (argc, argv, env); \
}
/* Call the DT_FINI function of MAP located at code address START,
again through a freshly built function descriptor. */
#define DL_CALL_DT_FINI(map, start) \
{ \
ElfW(Addr) addr; \
DL_DT_FUNCTION_ADDRESS(map, start, , addr) \
fini_t fini = (fini_t) addr; \
fini (); \
}
/* The type of the return value of fixup/profile_fixup. */
#define DL_FIXUP_VALUE_TYPE struct fdesc
/* Construct a value of type DL_FIXUP_VALUE_TYPE from a code address
and a link map. The second member is taken from the map's DT_PLTGOT
entry. */
#define DL_FIXUP_MAKE_VALUE(map, addr) \
((struct fdesc) { (addr), (map)->l_info[DT_PLTGOT]->d_un.d_ptr })
/* Extract the code address from a value of type DL_FIXUP_VALUE_TYPE.
*/
#define DL_FIXUP_VALUE_CODE_ADDR(value) (value).ip
/* Address of the descriptor VALUE itself, as an integer. */
#define DL_FIXUP_VALUE_ADDR(value) ((uintptr_t) &(value))
/* Read the descriptor stored at ADDR. */
#define DL_FIXUP_ADDR_VALUE(addr) (*(struct fdesc *) (addr))
#define DL_FIXUP_BINDNOW_ADDR_VALUE(addr) (addr)
/* Copy the whole descriptor found at ST_VALUE into *VALUE; the other
arguments are unused here. */
#define DL_FIXUP_BINDNOW_RELOC(l, reloc, value, new_value, st_value, lazy) \
(*value) = *(struct fdesc *) (st_value)

View File

@ -1,460 +0,0 @@
/* Machine-dependent ELF dynamic relocation inline functions. IA-64 version.
Copyright (C) 1995-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef dl_machine_h
#define dl_machine_h 1
#define ELF_MACHINE_NAME "ia64"
#include <assert.h>
#include <string.h>
#include <link.h>
#include <errno.h>
#include <dl-fptr.h>
#include <tls.h>
#include <dl-static-tls.h>
#include <dl-machine-rel.h>
/* Translate a processor specific dynamic tag to the index
in l_info array. X is the tag name without the DT_IA_64_ prefix;
the result is the tag's offset from DT_LOPROC placed after the
DT_NUM generic slots. */
#define DT_IA_64(x) (DT_IA_64_##x - DT_LOPROC + DT_NUM)
/* Point MAP's function-descriptor table at the static bootstrap table
   _dl_boot_fptr_table and record its capacity.  */
static inline void __attribute__ ((always_inline))
__ia64_init_bootstrap_fdesc_table (struct link_map *map)
{
  Elf64_Addr *table;

  /* Runs before the GOT has been relocated, so the address is formed
     gp-relative instead of being loaded from the GOT.  */
  asm (";; addl %0 = @gprel (_dl_boot_fptr_table), gp" : "=r"(table));

  map->l_mach.fptr_table = table;
  map->l_mach.fptr_table_len = ELF_MACHINE_BOOT_FPTR_TABLE_LEN;
}
/* Hook that installs the bootstrap function-descriptor table on MAP;
DYNAMIC_INFO is unused. Note the expansion already ends in `;'. */
#define ELF_MACHINE_BEFORE_RTLD_RELOC(map, dynamic_info) \
__ia64_init_bootstrap_fdesc_table (map);
/* Tell whether the ELF header EHDR describes an IA-64 object.
   Returns 1 when e_machine is EM_IA_64, 0 otherwise.  */
static inline int __attribute__ ((unused))
elf_machine_matches_host (const Elf64_Ehdr *ehdr)
{
  const int is_ia64 = (EM_IA_64 == ehdr->e_machine);

  return is_ia64;
}
/* Link-time address of _DYNAMIC.  The asm emits an 8-byte object
   __dynamic_ltv in .sdata holding @ltv(_DYNAMIC) and yields its
   gp-relative address; the function returns the word stored there.  */
static inline Elf64_Addr __attribute__ ((unused, const))
elf_machine_dynamic (void)
{
  Elf64_Addr *ltv_slot;

  __asm__ (
	".section .sdata\n"
	" .type __dynamic_ltv#, @object\n"
	" .size __dynamic_ltv#, 8\n"
	"__dynamic_ltv:\n"
	" data8 @ltv(_DYNAMIC#)\n"
	".previous\n"
	" addl %0 = @gprel(__dynamic_ltv#), gp ;;"
	: "=r" (ltv_slot));

  return ltv_slot[0];
}
/* Run-time load address of this object: the instruction pointer read
   at local label 1 minus the @ltv value recorded for that same label
   in a 4-byte .sdata word.  */
static inline Elf64_Addr __attribute__ ((unused))
elf_machine_load_address (void)
{
  Elf64_Addr runtime_ip;
  int *ltv_word;

  __asm__ (
	"1: mov %0 = ip\n"
	".section .sdata\n"
	"2: data4 @ltv(1b)\n"
	" .align 8\n"
	".previous\n"
	" addl %1 = @gprel(2b), gp ;;"
	: "=r" (runtime_ip), "=r" (ltv_word));

  /* The stored word is an int and is converted to Elf64_Addr here.  */
  const Elf64_Addr link_time_ip = (Elf64_Addr) *ltv_word;

  return runtime_ip - link_time_ip;
}
/* Prepare object L for lazy binding.  When LAZY is nonzero, the three
   reserved words located through the DT_IA_64_PLT_RESERVE dynamic
   entry are filled with, in order: the link map L, the code address
   of the resolver trampoline, and the current gp.  With PROFILE set
   (SHARED builds only) the profiling trampoline is used instead and L
   is recorded as the profiled map when its name matches dl_profile.
   SCOPE is unused.  Returns LAZY unchanged.  */
static inline int __attribute__ ((unused, always_inline))
elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
			   int lazy, int profile)
{
  extern void _dl_runtime_resolve (void);
  extern void _dl_runtime_profile (void);

  if (!lazy)
    return lazy;

  register Elf64_Addr gp __asm__ ("gp");
  Elf64_Addr resolver_ip;

  /* Add l_addr as an integer before casting; adding it to a pointer
     would scale it by the element size.  */
  Elf64_Addr *const plt_reserve
    = (Elf64_Addr *) (l->l_info[DT_IA_64 (PLT_RESERVE)]->d_un.d_ptr
		      + l->l_addr);

  /* Word 0 identifies this shared object.  */
  plt_reserve[0] = (Elf64_Addr) l;

  /* Pick the trampoline that will perform the relocation.  */
#ifdef SHARED
  if (__glibc_unlikely (profile))
    {
      /* If this is the object being profiled, remember it so that
	 profiling is really wanted and the timers are started.  */
      if (GLRO(dl_profile) != NULL
	  && _dl_name_match_p (GLRO(dl_profile), l))
	GL(dl_profile_map) = l;

      resolver_ip = (Elf64_Addr) ELF_PTR_TO_FDESC (&_dl_runtime_profile)->ip;
    }
  else
#endif
    resolver_ip = (Elf64_Addr) ELF_PTR_TO_FDESC (&_dl_runtime_resolve)->ip;

  plt_reserve[1] = resolver_ip;
  plt_reserve[2] = gp;

  return lazy;
}
/* Names of the architecture-specific auditing callback functions. */
#define ARCH_LA_PLTENTER ia64_gnu_pltenter
#define ARCH_LA_PLTEXIT ia64_gnu_pltexit
/* Undo the adds out0 = 16, sp below to get at the value we want in
__libc_stack_end. COOKIE is the value passed in out0 by the start
code; 16 is subtracted from it. */
#define DL_STACK_END(cookie) \
((void *) (((long) (cookie)) - 16))
/* Initial entry point code for the dynamic linker.
The C function `_dl_start' is the real entry point;
its return value is the user program's entry point.

Outline of the assembly emitted by this macro:
_start computes gp as (ip - @gprel offset of the entry label), keeps a
copy in loc1, writes a constant to ar.fpsr, and calls _dl_start with
out0 = sp + 16.
_dl_start_user (reached by fall-through) saves the value returned by
_dl_start in loc2, loads the adjusted _dl_argc and _dl_argv, computes
envp = argv + argc + 1, loads the link map from _rtld_local and calls
_dl_init (map, argc, argv, envp). It then loads the code address and
gp from the descriptor in loc2, puts the function descriptor of
_dl_fini in ret0, restores ar.pfs and branches to the code address.
NOTE(review): the meaning of the ar.fpsr constant is not documented
here -- confirm against the architecture manual before changing it. */
#define RTLD_START asm ( \
".text\n" \
" .global _start#\n" \
" .proc _start#\n" \
"_start:\n" \
"0: { .mii\n" \
" .prologue\n" \
" .save rp, r0\n" \
" .body\n" \
" .prologue\n" \
" .save ar.pfs, r32\n" \
" alloc loc0 = ar.pfs, 0, 3, 4, 0\n" \
" .body\n" \
" mov r2 = ip\n" \
" addl r3 = @gprel(0b), r0\n" \
" ;;\n" \
" }\n" \
" { .mlx\n" \
" /* Calculate the GP, and save a copy in loc1. */\n" \
" sub gp = r2, r3\n" \
" movl r8 = 0x9804c0270033f\n" \
" ;;\n" \
" }\n" \
" { .mii\n" \
" mov ar.fpsr = r8\n" \
" sub loc1 = r2, r3\n" \
" /* _dl_start wants a pointer to the pointer to the arg block and\n" \
" the arg block starts with an integer, thus the magic 16. */\n" \
" adds out0 = 16, sp\n" \
" }\n" \
" { .bbb\n" \
" br.call.sptk.many b0 = _dl_start#\n" \
" ;;\n" \
" }\n" \
" .endp _start#\n" \
" /* FALLTHRU */\n" \
" .global _dl_start_user#\n" \
" .proc _dl_start_user#\n" \
"_dl_start_user:\n" \
" .prologue\n" \
" .save rp, r0\n" \
" .body\n" \
" .prologue\n" \
" .save ar.pfs, r32\n" \
" .body\n" \
" { .mii\n" \
" /* Save the pointer to the user entry point fptr in loc2. */\n" \
" mov loc2 = ret0\n" \
" addl r2 = @ltoff(_dl_argc), gp\n" \
" ;;\n" \
" }\n" \
" { .mii\n" \
" ld8 out1 = [r2] /* Get the _dl_argc address. */\n" \
" addl r3 = @ltoff(_dl_argv), gp\n" \
" ;;\n" \
" }\n" \
" { .mmi\n" \
" ld8 out2 = [r3] /* Get the _dl_argv address. */\n" \
" ld8 out1 = [out1] /* Get the adjusted _dl_argc. */\n" \
" addl r2 = @gprel(_rtld_local), gp\n" \
" ;;\n" \
" }\n" \
" { .mmi\n" \
" sxt4 out3 = out1 /* envp = argv + argc + 1 */\n" \
" ;;\n" \
" }\n" \
" { .mmi\n" \
" adds out3 = 1, out3\n" \
" ;;\n" \
" }\n" \
" { .mmi\n" \
" ld8 out2 = [out2] /* Get the adjusted _dl_argv. */\n" \
" shladd out3 = out3, 3, r0\n" \
" ;;\n" \
" }\n" \
" { .mmb\n" \
" add out3 = out3, out2\n" \
" ld8 out0 = [r2] /* Get the linkmap. */\n" \
" br.call.sptk.many b0 = _dl_init#\n" \
" }\n" \
" /* Pass our finalizer function to the user,\n" \
" and jump to the user's entry point. */\n" \
" { .mmi\n" \
" ld8 r3 = [loc2], 8\n" \
" mov b0 = r0\n" \
" }\n" \
" { .mmi\n" \
" addl ret0 = @ltoff(@fptr(_dl_fini#)), gp\n" \
" ;;\n" \
" mov b6 = r3\n" \
" }\n" \
" { .mmi\n" \
" ld8 ret0 = [ret0]\n" \
" ld8 gp = [loc2]\n" \
" mov ar.pfs = loc0\n" \
" ;;\n" \
" }\n" \
" { .mfb\n" \
" br.sptk.many b6\n" \
" ;;\n" \
" }\n" \
" .endp _dl_start_user#\n" \
".previous\n");
#ifndef RTLD_START_SPECIAL_INIT
#define RTLD_START_SPECIAL_INIT /* nothing */
#endif
/* ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or TLS
variable, so undefined references should not be allowed to define the
value.
ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
of the main executable's symbols, as for a COPY reloc, which we don't
use. */
/* ??? Ignore *MSB for now. */
/* The comparison yields 0 or 1, which is multiplied by
ELF_RTYPE_CLASS_PLT; the result is never ELF_RTYPE_CLASS_COPY. */
#define elf_machine_type_class(type) \
(((type) == R_IA64_IPLTLSB || (type) == R_IA64_DTPMOD64LSB \
|| (type) == R_IA64_DTPREL64LSB || (type) == R_IA64_TPREL64LSB) \
* ELF_RTYPE_CLASS_PLT)
/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries. */
#define ELF_MACHINE_JMP_SLOT R_IA64_IPLTLSB
/* Return the address of the entry point. The value is the address of
a function descriptor built by DL_DT_FUNCTION_ADDRESS; `static' is
passed as the storage class so the descriptor outlives the statement
expression. */
#define ELF_MACHINE_START_ADDRESS(map, start) \
({ \
ElfW(Addr) addr; \
DL_DT_FUNCTION_ADDRESS(map, start, static, addr) \
addr; \
})
/* Fixup a PLT entry to bounce directly to the function at VALUE.
RELOC_ADDR points at a two-word descriptor: word 1 receives VALUE.gp
first, then word 0 receives VALUE.ip through a volatile access.
L, T, REFSYM, SYM and RELOC are not used. Returns VALUE. */
static inline struct fdesc __attribute__ ((always_inline))
elf_machine_fixup_plt (struct link_map *l, lookup_t t,
const ElfW(Sym) *refsym, const ElfW(Sym) *sym,
const Elf64_Rela *reloc,
Elf64_Addr *reloc_addr, struct fdesc value)
{
/* l is the link_map for the caller, t is the link_map for the object
* being called */
/* got has already been relocated in elf_get_dynamic_info() */
reloc_addr[1] = value.gp;
/* we need a "release" here to ensure that the gp is visible before
the code entry point is updated: */
((volatile Elf64_Addr *) reloc_addr)[0] = value.ip;
return value;
}
/* Return the final value of a plt relocation. */
static inline struct fdesc
elf_machine_plt_value (struct link_map *map, const Elf64_Rela *reloc,
struct fdesc value)
{
/* No need to handle rel vs rela since IA64 is rela only */
return (struct fdesc) { value.ip + reloc->r_addend, value.gp };
}
#endif /* !dl_machine_h */
#ifdef RESOLVE_MAP
/* An IA-64 relocation number packs a data format into its low three
   bits; the remaining bits select the relocation type.  */
#define R_IA64_FORMAT(R) ((R) & 7)
#define R_IA64_TYPE(R) ((R) & ~7)

/* Values of R_IA64_FORMAT: word size and byte order.  */
#define R_IA64_FORMAT_32MSB 4
#define R_IA64_FORMAT_32LSB 5
#define R_IA64_FORMAT_64MSB 6
#define R_IA64_FORMAT_64LSB 7
/* Perform the relocation specified by RELOC and SYM (which is fully
resolved). MAP is the object containing the reloc.
SCOPE and VERSION are handed to RESOLVE_MAP; RELOC_ADDR_ARG is the
location to patch; SKIP_IFUNC is not used. The function first
computes VALUE according to the relocation type and then stores it
in the width selected by the relocation's format bits. IPLT
relocations with a resolved symbol are written by
elf_machine_fixup_plt and return early. */
static inline void
__attribute ((always_inline))
elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
const Elf64_Rela *reloc,
const Elf64_Sym *sym,
const struct r_found_version *version,
void *const reloc_addr_arg,
int skip_ifunc)
{
Elf64_Addr *const reloc_addr = reloc_addr_arg;
const unsigned long int r_type = ELF64_R_TYPE (reloc->r_info);
Elf64_Addr value;
/* We cannot use a switch here because we cannot locate the switch
jump table until we've self-relocated. */
#if !defined RTLD_BOOTSTRAP
if (__builtin_expect (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_REL64LSB),
0))
{
assert (ELF64_R_TYPE (reloc->r_info) == R_IA64_REL64LSB);
value = *reloc_addr + map->l_addr;
}
else
#endif
if (__builtin_expect (r_type == R_IA64_NONE, 0))
return;
else
{
struct link_map *sym_map = RESOLVE_MAP (map, scope, &sym, version,
r_type);
/* RESOLVE_MAP() will return NULL if it fails to locate the symbol. */
if (sym_map != NULL)
{
value = SYMBOL_ADDRESS (sym_map, sym, true) + reloc->r_addend;
if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_DIR64LSB))
;/* No adjustment. */
else if (r_type == R_IA64_IPLTLSB)
{
elf_machine_fixup_plt (NULL, NULL, NULL, NULL, reloc, reloc_addr,
DL_FIXUP_MAKE_VALUE (sym_map, value));
return;
}
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_FPTR64LSB))
value = _dl_make_fptr (sym_map, sym, value);
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_PCREL64LSB))
/* Relative to the relocated address rounded down to 16 bytes. */
value -= (Elf64_Addr) reloc_addr & -16;
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_DTPMOD64LSB))
/* Note: the DTPREL64LSB branch below exists only in the
non-bootstrap configuration. */
#ifdef RTLD_BOOTSTRAP
/* During startup the dynamic linker is always index 1. */
value = 1;
#else
/* Get the information from the link map returned by the
resolv function. */
value = sym_map->l_tls_modid;
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_DTPREL64LSB))
value -= sym_map->l_addr;
#endif
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_TPREL64LSB))
{
#ifndef RTLD_BOOTSTRAP
CHECK_STATIC_TLS (map, sym_map);
#endif
value += sym_map->l_tls_offset - sym_map->l_addr;
}
else
_dl_reloc_bad_type (map, r_type, 0);
}
else
/* Symbol not found: store zero. */
value = 0;
}
/* ??? Ignore MSB and Instruction format for now. */
if (R_IA64_FORMAT (r_type) == R_IA64_FORMAT_64LSB)
*reloc_addr = value;
else if (R_IA64_FORMAT (r_type) == R_IA64_FORMAT_32LSB)
*(int *) reloc_addr = value;
else if (r_type == R_IA64_IPLTLSB)
{
/* Reached for an IPLT relocation whose symbol was not resolved:
clear both words of the descriptor. */
reloc_addr[0] = 0;
reloc_addr[1] = 0;
}
else
_dl_reloc_bad_type (map, r_type, 0);
}
/* Tell do-rel.h that on IA-64 every RELATIVE reloc may be skipped
   when l_addr is 0.  */
#define ELF_MACHINE_REL_RELATIVE 1

/* Apply a relative relocation: add L_ADDR to the 64-bit word at
   RELOC_ADDR_ARG.  RELOC must be of type R_IA64_REL64LSB (asserted).  */
static inline void
__attribute ((always_inline))
elf_machine_rela_relative (Elf64_Addr l_addr, const Elf64_Rela *reloc,
			   void *const reloc_addr_arg)
{
  Elf64_Addr *const where = reloc_addr_arg;

  /* ??? MSB and instruction formats are ignored for now.  */
  assert (ELF64_R_TYPE (reloc->r_info) == R_IA64_REL64LSB);

  *where = *where + l_addr;
}
/* Lazy-binding pass over one relocation.  For R_IA64_IPLTLSB both
   words of the descriptor at L_ADDR + r_offset are rebased by L_ADDR;
   R_IA64_NONE is ignored; any other type is reported through
   _dl_reloc_bad_type.  SCOPE and SKIP_IFUNC are unused.  */
static inline void
__attribute ((always_inline))
elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
		      Elf64_Addr l_addr, const Elf64_Rela *reloc,
		      int skip_ifunc)
{
  const unsigned long int kind = ELF64_R_TYPE (reloc->r_info);

  if (kind == R_IA64_NONE)
    return;

  if (kind != R_IA64_IPLTLSB)
    {
      _dl_reloc_bad_type (map, kind, 1);
      return;
    }

  Elf64_Addr *const fdesc_words = (void *) (l_addr + reloc->r_offset);

  fdesc_words[0] += l_addr;
  fdesc_words[1] += l_addr;
}
#endif /* RESOLVE_MAP */

View File

@ -1,30 +0,0 @@
/* Thread-local storage handling in the ELF dynamic linker. IA-64 version.
Copyright (C) 2002-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* On IA-64 the __tls_get_addr function takes the module ID and the
offset as parameters. */
/* Parameter list, argument list and the names of the two parameters,
for use by code that defines or forwards to __tls_get_addr. */
#define GET_ADDR_ARGS size_t tls_ia64_m, size_t tls_ia64_offset
#define GET_ADDR_PARAM tls_ia64_m, tls_ia64_offset
#define GET_ADDR_MODULE tls_ia64_m
#define GET_ADDR_OFFSET tls_ia64_offset
/* We have no tls_index type. */
#define DONT_USE_TLS_INDEX 1
extern void *__tls_get_addr (size_t m, size_t offset);

View File

@ -1,538 +0,0 @@
/* PLT trampolines. ia64 version.
Copyright (C) 2005-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#undef ret
/*
This code is used in dl-runtime.c to call the `_dl_fixup' function
and then redirect to the address it returns. `_dl_fixup()' takes two
arguments, however _dl_profile_fixup() takes five.
The ABI specifies that we will never see more than 8 input
registers to a function call, thus it is safe to simply allocate
those, and simpler than playing stack games. */
/* Used to save and restore 8 incoming fp registers */
#define RESOLVE_FRAME_SIZE (16*8)
/* _dl_runtime_resolve: on entry r16 is passed through as the first
argument of _dl_fixup and r15 is multiplied by 24 (16*r15 + 8*r15) to
form the second. r8-r11 and f8-f15 are preserved around the call.
Afterwards ret0 becomes the branch target (b6) and ret1 the new gp,
the frame is popped and control transfers to b6 with the eight
incoming registers turned into outgoing ones. */
ENTRY(_dl_runtime_resolve)
{ .mmi
.prologue
.save ar.pfs, r40
alloc loc0 = ar.pfs, 8, 6, 2, 0
/* Use the 16 byte scratch area. r2 will start at f8 and
r3 will start at f9. */
adds r2 = -(RESOLVE_FRAME_SIZE - 16), r12
adds r3 = -(RESOLVE_FRAME_SIZE - 32), r12
}
{ .mii
.fframe RESOLVE_FRAME_SIZE
adds r12 = -RESOLVE_FRAME_SIZE, r12
.save rp, loc1
mov loc1 = b0
.body
mov loc2 = r8 /* preserve struct value register */
;;
}
{ .mii
mov loc3 = r9 /* preserve language specific register */
mov loc4 = r10 /* preserve language specific register */
mov loc5 = r11 /* preserve language specific register */
}
{ .mmi
stf.spill [r2] = f8, 32
stf.spill [r3] = f9, 32
mov out0 = r16
;;
}
{ .mmi
stf.spill [r2] = f10, 32
stf.spill [r3] = f11, 32
shl out1 = r15, 4
;;
}
{ .mmi
stf.spill [r2] = f12, 32
stf.spill [r3] = f13, 32
/* Relocation record is 24 bytes. */
shladd out1 = r15, 3, out1
;;
}
{ .mmb
stf.spill [r2] = f14
stf.spill [r3] = f15
br.call.sptk.many b0 = _dl_fixup
}
{ .mii
/* Skip the 16byte scratch area. */
adds r2 = 16, r12
adds r3 = 32, r12
mov b6 = ret0
;;
}
{ .mmi
ldf.fill f8 = [r2], 32
ldf.fill f9 = [r3], 32
mov b0 = loc1
;;
}
{ .mmi
ldf.fill f10 = [r2], 32
ldf.fill f11 = [r3], 32
mov gp = ret1
;;
}
{ .mmi
ldf.fill f12 = [r2], 32
ldf.fill f13 = [r3], 32
mov ar.pfs = loc0
;;
}
{ .mmi
ldf.fill f14 = [r2], 32
ldf.fill f15 = [r3], 32
.restore sp /* pop the unwind frame state */
adds r12 = RESOLVE_FRAME_SIZE, r12
;;
}
{ .mii
mov r9 = loc3 /* restore language specific register */
mov r10 = loc4 /* restore language specific register */
mov r11 = loc5 /* restore language specific register */
}
{ .mii
mov r8 = loc2 /* restore struct value register */
;;
}
/* An alloc is needed for the break system call to work.
We don't care about the old value of the pfs register. */
{ .mmb
.prologue
.body
alloc r2 = ar.pfs, 0, 0, 8, 0
br.sptk.many b6
;;
}
END(_dl_runtime_resolve)
/* The fourth argument to _dl_profile_fixup and the third one to
_dl_audit_pltexit are a pointer to La_ia64_regs:
8byte r8
8byte r9
8byte r10
8byte r11
8byte in0
8byte in1
8byte in2
8byte in3
8byte in4
8byte in5
8byte in6
8byte in7
16byte f8
16byte f9
16byte f10
16byte f11
16byte f12
16byte f13
16byte f14
16byte f15
8byte ar.unat
8byte sp
The fifth argument to _dl_profile_fixup is a pointer to long int.
The fourth argument to _dl_audit_pltexit is a pointer to
La_ia64_retval:
8byte r8
8byte r9
8byte r10
8byte r11
16byte f8
16byte f9
16byte f10
16byte f11
16byte f12
16byte f13
16byte f14
16byte f15
Since stack has to be 16 byte aligned, the stack allocation is in
16byte increment. Before calling _dl_profile_fixup, the stack will
look like
psp new frame_size
+16 La_ia64_regs
sp scratch
*/
/* Frame for the call to _dl_profile_fixup, term by term: r8-r11 (4*8),
in0-in7 (8*8), f8-f15 (8*16), ar.unat and sp (2*8), plus the 16 byte
scratch area. */
#define PLTENTER_FRAME_SIZE (4*8 + 8*8 + 8*16 + 2*8 + 16)
/* Frame for the call to _dl_audit_pltexit: the above plus room for
La_ia64_retval, i.e. r8-r11 (4*8) and f8-f15 (8*16). */
#define PLTEXIT_FRAME_SIZE (PLTENTER_FRAME_SIZE + 4*8 + 8*16)
#if !defined PROF && defined SHARED
/* _dl_runtime_profile: profiling/auditing variant of the resolver.
It stores r8-r11, in0-in7, f8-f15, ar.unat and the original sp in a
La_ia64_regs block on the stack and calls _dl_profile_fixup with
out0 = r16, out1 = r15 * 24, out2 = b0, out3 = &La_ia64_regs and
out4 = the original sp, where the new frame size is written.
If that frame size is -1 the resolved function is entered with a plain
branch. Otherwise the function is called (after copying the requested
number of stack bytes, rounded up to 16, when the size is nonzero),
its return registers are stored in a La_ia64_retval block,
_dl_audit_pltexit is called, the return registers are reloaded and
control returns to the original caller.
Locals: loc0 = ar.pfs, loc1 = rp, loc2-loc5 = r8-r11, loc6 = ar.lc,
loc7 = &La_ia64_regs, loc8 = r16, loc9 = r15 * 24, loc10 = original sp,
loc11 = gp. */
ENTRY(_dl_runtime_profile)
{ .mii
.prologue
.save ar.pfs, r40
alloc loc0 = ar.pfs, 8, 12, 8, 0
.vframe loc10
mov loc10 = r12
.save rp, loc1
mov loc1 = b0
}
{ .mii
.save ar.unat, r17
mov r17 = ar.unat
.save ar.lc, loc6
mov loc6 = ar.lc
mov loc11 = gp
}
{ .mii
.body
/* There is a 16 byte scratch area. r2 will start at r8 and
r3 will start at r9 for La_ia64_regs. */
adds r2 = -(PLTENTER_FRAME_SIZE - 16), r12
adds r3 = -(PLTENTER_FRAME_SIZE - 24), r12
adds r12 = -PLTENTER_FRAME_SIZE, r12
;;
}
{ .mmi
st8 [r2] = r8, 16;
st8 [r3] = r9, 16;
mov out2 = b0 /* needed by _dl_profile_fixup */
;;
}
{ .mmi
st8 [r2] = r10, 16;
st8 [r3] = r11, 16;
adds out3 = 16, r12 /* pointer to La_ia64_regs */
;;
}
{ .mmi
.mem.offset 0, 0
st8.spill [r2] = in0, 16
.mem.offset 8, 0
st8.spill [r3] = in1, 16
mov out4 = loc10 /* pointer to new frame size */
;;
}
{ .mmi
.mem.offset 0, 0
st8.spill [r2] = in2, 16
.mem.offset 8, 0
st8.spill [r3] = in3, 16
mov loc2 = r8 /* preserve struct value register */
;;
}
{ .mmi
.mem.offset 0, 0
st8.spill [r2] = in4, 16
.mem.offset 8, 0
st8.spill [r3] = in5, 16
mov loc3 = r9 /* preserve language specific register */
;;
}
{ .mmi
.mem.offset 0, 0
st8 [r2] = in6, 16
.mem.offset 8, 0
st8 [r3] = in7, 24 /* adjust for f9 */
mov loc4 = r10 /* preserve language specific register */
;;
}
{ .mii
mov r18 = ar.unat /* save it in La_ia64_regs */
mov loc7 = out3 /* save it for _dl_audit_pltexit */
mov loc5 = r11 /* preserve language specific register */
}
{ .mmi
stf.spill [r2] = f8, 32
stf.spill [r3] = f9, 32
mov out0 = r16 /* needed by _dl_profile_fixup */
;;
}
{ .mii
mov ar.unat = r17 /* restore it for function call */
mov loc8 = r16 /* save it for _dl_audit_pltexit */
nop.i 0x0
}
{ .mmi
stf.spill [r2] = f10, 32
stf.spill [r3] = f11, 32
shl out1 = r15, 4
;;
}
{ .mmi
stf.spill [r2] = f12, 32
stf.spill [r3] = f13, 32
/* Relocation record is 24 bytes. */
shladd out1 = r15, 3, out1
;;
}
{ .mmi
stf.spill [r2] = f14, 32
stf.spill [r3] = f15, 24
mov loc9 = out1 /* save it for _dl_audit_pltexit */
;;
}
{ .mmb
st8 [r2] = r18 /* store ar.unat */
st8 [r3] = loc10 /* store sp */
br.call.sptk.many b0 = _dl_profile_fixup
}
{ .mii
/* Skip the 16byte scratch area, 4 language specific GRs and
8 incoming GRs to restore incoming fp registers. */
adds r2 = (4*8 + 8*8 + 16), r12
adds r3 = (4*8 + 8*8 + 32), r12
mov b6 = ret0
;;
}
{ .mmi
ldf.fill f8 = [r2], 32
ldf.fill f9 = [r3], 32
mov gp = ret1
;;
}
{ .mmi
ldf.fill f10 = [r2], 32
ldf.fill f11 = [r3], 32
mov r8 = loc2 /* restore struct value register */
;;
}
{ .mmi
ldf.fill f12 = [r2], 32
ldf.fill f13 = [r3], 32
mov r9 = loc3 /* restore language specific register */
;;
}
{ .mmi
ldf.fill f14 = [r2], 32
ldf.fill f15 = [r3], 32
mov r10 = loc4 /* restore language specific register */
;;
}
{ .mii
ld8 r15 = [loc10] /* load the new frame size */
mov r11 = loc5 /* restore language specific register */
;;
/* p6: frame size is -1, branch to the function without auditing its
return. p7 is the complement of p6. */
cmp.eq p6, p7 = -1, r15
;;
}
{ .mii
/* p8: frame size is 0, nothing to copy. p9: nonzero size. */
(p7) cmp.eq p8, p9 = 0, r15
(p6) mov b0 = loc1
(p6) mov ar.lc = loc6
}
{ .mib
nop.m 0x0
(p6) mov ar.pfs = loc0
(p6) br.cond.dptk.many .Lresolved
;;
}
/* At this point, the stack looks like
+psp free
+16 La_ia64_regs
sp scratch
We need to keep the current stack and call the resolved
function by copying the r15 byte from sp + PLTENTER_FRAME_SIZE
+ 16 (scratch area) to sp + 16 (scratch area). Since stack
has to be 16byte aligned, we round r15 up to 16byte. */
{ .mbb
(p9) adds r15 = 15, r15
(p8) br.cond.dptk.many .Lno_new_frame
nop.b 0x0
;;
}
{ .mmi
and r15 = -16, r15
;;
/* We don't copy the 16byte scratch area. Prepare r16/r17 as
destination. */
sub r16 = r12, r15
sub r17 = r12, r15
;;
}
{ .mii
adds r16 = 16, r16
adds r17 = 24, r17
sub r12 = r12, r15 /* Adjust stack */
;;
}
{ .mii
nop.m 0x0
/* Loop count: number of 16 byte chunks minus one. */
shr r15 = r15, 4
;;
adds r15 = -1, r15
;;
}
{ .mii
/* Skip the 16byte scratch area. Prepare r2/r3 as source. */
adds r2 = 16, loc10
adds r3 = 24, loc10
mov ar.lc = r15
;;
}
.Lcopy:
{ .mmi
ld8 r18 = [r2], 16
ld8 r19 = [r3], 16
nop.i 0x0
;;
}
{ .mmb
st8 [r16] = r18, 16
st8 [r17] = r19, 16
br.cloop.sptk.few .Lcopy
}
.Lno_new_frame:
{ .mii
mov out0 = in0
mov out1 = in1
mov out2 = in2
}
{ .mii
mov out3 = in3
mov out4 = in4
mov out5 = in5
}
{ .mib
mov out6 = in6
mov out7 = in7
/* Call the resolved function */
br.call.sptk.many b0 = b6
}
{ .mii
/* Prepare stack for _dl_audit_pltexit. Loc10 has the original
stack pointer. */
adds r12 = -PLTEXIT_FRAME_SIZE, loc10
adds r2 = -(PLTEXIT_FRAME_SIZE - 16), loc10
adds r3 = -(PLTEXIT_FRAME_SIZE - 24), loc10
;;
}
{ .mmi
/* Store all possible return values into the buffer. */
st8 [r2] = r8, 16
st8 [r3] = r9, 16
mov out0 = loc8
;;
}
{ .mmi
st8 [r2] = r10, 16
st8 [r3] = r11, 24
mov out1 = loc9
;;
}
{ .mmi
stf.spill [r2] = f8, 32
stf.spill [r3] = f9, 32
mov out2 = loc7 /* Pointer to La_ia64_regs */
;;
}
{ .mmi
stf.spill [r2] = f10, 32
stf.spill [r3] = f11, 32
adds out3 = 16, r12 /* Pointer to La_ia64_retval */
;;
}
{ .mmi
stf.spill [r2] = f12, 32
stf.spill [r3] = f13, 32
/* We need to restore gp for _dl_audit_pltexit. */
mov gp = loc11
;;
}
{ .mmb
stf.spill [r2] = f14
stf.spill [r3] = f15
br.call.sptk.many b0 = _dl_audit_pltexit
}
{ .mmi
/* Load all the non-floating and floating return values. Skip
the 16byte scratch area. */
adds r2 = 16, r12
adds r3 = 24, r12
nop.i 0x0
;;
}
{ .mmi
ld8 r8 = [r2], 16
ld8 r9 = [r3], 16
nop.i 0x0
;;
}
{ .mmi
ld8 r10 = [r2], 16
ld8 r11 = [r3], 24
nop.i 0x0
;;
}
{ .mmi
ldf.fill f8 = [r2], 32
ldf.fill f9 = [r3], 32
mov ar.lc = loc6
;;
}
{ .mmi
ldf.fill f10 = [r2], 32
ldf.fill f11 = [r3], 32
mov ar.pfs = loc0
;;
}
{ .mmi
ldf.fill f12 = [r2], 32
ldf.fill f13 = [r3], 32
mov b0 = loc1
;;
}
{ .mmi
ldf.fill f14 = [r2]
ldf.fill f15 = [r3]
/* We know that the previous stack pointer, loc10, isn't 0.
We use it to reload p7. */
cmp.ne p7, p0 = 0, loc10
;;
}
.Lresolved:
/* Reached with p7 clear from the frame-size == -1 path (fall through
to the branch below) and with p7 set after _dl_audit_pltexit (return
to the caller). */
{ .mmb
.restore sp
mov r12 = loc10
(p7) br.ret.sptk.many b0
;;
}
/* An alloc is needed for the break system call to work. We
don't care about the old value of the pfs register. After
this alloc, we can't use any rotating registers. Otherwise
assembler won't be happy. This has to be at the end. */
{ .mmb
.prologue
.body
alloc r2 = ar.pfs, 0, 0, 8, 0
br.sptk.many b6
;;
}
END(_dl_runtime_profile)
#endif

View File

@ -1,20 +0,0 @@
/* Determine DT_INIT/DT_FINI support in the dynamic loader. IA64 version.
Copyright (C) 2020-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Enable DT_INIT/DT_FINI support. */
#define ELF_INITFINI 1

View File

@ -1,8 +0,0 @@
#include <link.h>
#include <dl-fptr.h>
extern void _start (void);
/* The function's entry point is stored in the first word of the
function descriptor (plabel) of _start(). */
#define ENTRY_POINT ELF_PTR_TO_FDESC (_start)->ip

View File

@ -1,3 +0,0 @@
/* ABI version for _Float128 ABI introduction. */
#define FLOAT128_VERSION GLIBC_2.26
#define FLOAT128_VERSION_M GLIBC_2_26

View File

@ -1,34 +0,0 @@
# Everything below only applies while building in the math subdirectory.
ifeq ($(subdir),math)
#
# Some files which need to go both into libc and libm have external
# dependencies which need to be resolved differently for libc
# vs. libm. For example, inside libc, __libm_error_support needs to
# resolve to HIDDEN_JUMPTARGET(__libm_error_support) whereas within
# libm it always resolves to __libm_error_support. Such files need to
# be compiled twice. Fortunately, math/Makefile already has logic to
# support this: if a file starts with "s_", make will automatically
# generate a matching file whose name starts with "m_" which simply
# includes the corresponding "s_" file.
#
# Routines that are built twice: the s_* names are added to libc
# (sysdep_routines below) and the derived m_* names to libm.
duplicated-routines = s_libm_ldexp s_libm_ldexpf s_libm_ldexpl \
s_libm_scalbn s_libm_scalbnf s_libm_scalbnl
# Extra libm objects; the last entry rewrites every s_% name in
# duplicated-routines to m_% via a substitution reference.
libm-sysdep_routines += s_erfc s_erfcf s_erfcl \
s_matherrf s_matherrl libm_reduce \
libm_error \
libm_frexp libm_frexpf libm_frexpl \
libm_sincos libm_sincosf libm_sincosl \
libm_sincos_large \
libm_lgamma libm_lgammaf libm_lgammal \
libm_scalblnf \
$(duplicated-routines:s_%=m_%)
# Extra libc objects, including the s_* flavour of the duplicated routines.
sysdep_routines += libc_libm_error libm_frexp libm_frexpf libm_frexpl \
$(duplicated-routines)
# Preprocessor setup shared by these sources: force-include
# libm-symbols.h and define the configuration macros listed here.
sysdep-CPPFLAGS += -include libm-symbols.h \
-D__POSIX__ -Dopensource \
-D_LIB_VERSIONIMF=_LIB_VERSION \
-DSIZE_INT_32 -DSIZE_LONG_INT_64 -DSIZE_LONG_LONG_INT_64 \
-DSIZE_LONG_64 -DIA64
endif

View File

@ -1,50 +0,0 @@
----------------------------------------------------------
Notes on how to update libm based on Intel's libm releases
----------------------------------------------------------
The source code in this directory is currently based on Intel libm
v2.1 as available from:
http://www.intel.com/software/products/opensource/libraries/num.htm
To ease importing, fix some bugs, and simplify integration into libc,
it is also necessary to apply the patch at:
ftp://ftp.hpl.hp.com/pub/linux-ia64/intel-libm-041228.diff.gz
The expectation is that Intel will integrate most if not all of these
changes into future releases of libm, so this patching step can
hopefully be omitted in the future.
Once the patched libm sources are extracted in a directory $LIBM, they
can be imported into the libc source tree at $LIBC with the following
step:
$ cd $LIBC/src/sysdep/ia64/fpu
$ ./import_intel_libm $LIBM
This should produce a number of "Importing..." messages, without
showing any errors.
At this point, you should be able to build glibc in the usual fashion.
We assume you do this in directory $OBJ. Once the build has
completed, run "make check" to verify that all (math) checks succeed.
If these checks succeed, you should also run the following commands to
verify that the new libm doesn't pollute the name-space and has proper
size-info for the data objects:
$ cd $LIBC/src/sysdep/ia64/fpu
$ import_check $OBJ/math/
There should be no (unexpected) errors reported by this script.
As an optional step, you may also want to confirm that the new libm
exports the exact same global symbols as the old one.
If you want to see the changes introduced by the "import_intel_libm"
script, you can run the commands:
$ cd $LIBC/src/sysdep/ia64/fpu
$ import_diffs
That's it.

View File

@ -1,10 +0,0 @@
libc {
GLIBC_PRIVATE {
__libm_frexp_4; __libm_frexp_4f; __libm_frexp_4l; __libm_error_support;
}
}
libm {
GLIBC_2.2.3 {
matherrf; matherrl;
}
}

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,878 +0,0 @@
.file "acos.s"
// Copyright (c) 2000 - 2003 Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
// 02/02/00 Initial version
// 08/17/00 New and much faster algorithm.
// 08/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths,
// fixed mfb split issue stalls.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 08/02/02 New and much faster algorithm II
// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
// The acos function computes the principal value of the arc cosine of x.
// acos(0) returns Pi/2, acos(1) returns 0, acos(-1) returns Pi.
// A domain error occurs for arguments not in the range [-1,+1].
//
// The acos function returns the arc cosine in the range [0, Pi] radians.
//
// There are 8 paths:
// 1. x = +/-0.0
// Return acos(x) = Pi/2 + x
//
// 2. 0.0 < |x| < 0.625
// Return acos(x) = Pi/2 - x - x^3 *PolA(x^2)
// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32
//
// 3. 0.625 <=|x| < 1.0
// Return acos(x) = Pi/2 - asin(x) =
// = Pi/2 - sign(x) * ( Pi/2 - sqrt(R) * PolB(R))
// Where R = 1 - |x|,
// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12
//
// sqrt(R) is approximated using the following sequence:
// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,
// |eps| < 2^(-8)
// Then 3 iterations are used to refine the result:
// H0 = 0.5*y0
// S0 = R*y0
//
// d0 = 0.5 - H0*S0
// H1 = H0 + d0*H0
// S1 = S0 + d0*S0
//
// d1 = 0.5 - H1*S1
// H2 = H1 + d1*H1
// S2 = S1 + d1*S1
//
// d2 = 0.5 - H2*S2
// S3 = S2 + d2*S2
//
// S3 approximates sqrt(R) with enough accuracy for this algorithm
//
// So, the result should be reconstructed as follows:
// acos(x) = Pi/2 - sign(x) * (Pi/2 - S3*PolB(R))
//
// But for optimization purposes the reconstruction step is slightly
// changed:
// acos(x) = Cpi + sign(x)*PolB(R)*S2 + sign(x)*d2*S2*PolB(R)
// where Cpi = 0 if x > 0 and Cpi = Pi if x < 0
//
// 4. |x| = 1.0
// Return acos(1.0) = 0.0, acos(-1.0) = Pi
//
// 5. 1.0 < |x| <= +INF
// A domain error occurs for arguments not in the range [-1,+1]
//
// 6. x = [S,Q]NaN
// Return acos(x) = QNaN
//
// 7. x is denormal
// Return acos(x) = Pi/2 - x,
//
// 8. x is unnormal
// Normalize input in f8 and return to the very beginning of the function
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input, output
// f6, f7, f9 -> f15, f32 -> f64
// General registers used:
// r3, r21 -> r31, r32 -> r38
// Predicate registers used:
// p0, p6 -> p14
//
// Assembly macros
//=========================================
// integer registers used
// scratch
// Symbolic names for the registers used by acos and its error path.
// Several names deliberately share one physical register because they
// are live on mutually exclusive paths (see the notes below).
rTblAddr = r3
rPiBy2Ptr = r21
rTmpPtr3 = r22
rDenoBound = r23
rOne = r24
rAbsXBits = r25
rHalf = r26
r0625 = r27
rSign = r28
rXBits = r29
rTmpPtr2 = r30
rTmpPtr1 = r31
// stacked
// r32-r38 are only allocated (alloc ... 0, 3, 4, 0) on the |x| > 1 error
// path: three locals for the saved state, four outputs for the call.
GR_SAVE_PFS = r32
GR_SAVE_B0 = r33
GR_SAVE_GP = r34
GR_Parameter_X = r35
GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Parameter_TAG = r38
// floating point registers used
// FR_X shares f10 with f1pX below; FR_X is only written on the error
// path.  FR_Y is f1, so the "second argument" stored for the error
// handler is the constant 1.0.
FR_X = f10
FR_Y = f1
FR_RESULT = f8
// scratch
fXSqr = f6
fXCube = f7
fXQuadr = f9
f1pX = f10
f1mX = f11
f1pXRcp = f12
f1mXRcp = f13
fH = f14
fS = f15
// stacked
// The fA* (|x| < 0.625) and fB* (|x| >= 0.625) coefficients share
// registers pairwise: the same load instructions fetch either set,
// depending on which table offset rTmpPtr2 was pointed at.
fA3 = f32
fB1 = f32
fA5 = f33
fB2 = f33
fA7 = f34
fPiBy2 = f34
fA9 = f35
fA11 = f36
fB10 = f35
fB11 = f36
fA13 = f37
fA15 = f38
fB4 = f37
fB5 = f38
fA17 = f39
fA19 = f40
fB6 = f39
fB7 = f40
fA21 = f41
fA23 = f42
fB3 = f41
fB8 = f42
fA25 = f43
fA27 = f44
fB9 = f43
fB12 = f44
fA29 = f45
fA31 = f46
fA33 = f47
fA35 = f48
fBaseP = f49
fB0 = f50
fSignedS = f51
fD = f52
fHalf = f53
fR = f54
fCloseTo1Pol = f55
fSignX = f56
fDenoBound = f57
fNormX = f58
fX8 = f59
fRSqr = f60
fRQuadr = f61
fR8 = f62
fX16 = f63
fCpi = f64
// Data tables
//==============================================================
// Coefficient table for acos.  The entries are interleaved (rather than
// sorted by index) so that the A-set and the B-set can be fetched by the
// same sequence of loads starting from two different offsets.
// Byte offsets the code relies on:
//     0  A29, A31, A33, A35 (four doubles)
//    32  B0  (16-byte extended entry)
//    48  A3  - start of the |x| <  0.625 coefficient walk
//   176  B1  - start of the |x| >= 0.625 coefficient walk
//   272  Pi/2
RODATA
.align 16
LOCAL_OBJECT_START(acos_base_range_table)
// Ai: Polynomial coefficients for the acos(x), |x| < .625000
// Bi: Polynomial coefficients for the acos(x), |x| > .625000
data8 0xBFDAAB56C01AE468 //A29
data8 0x3FE1C470B76A5B2B //A31
data8 0xBFDC5FF82A0C4205 //A33
data8 0x3FC71FD88BFE93F0 //A35
data8 0xB504F333F9DE6487, 0x00003FFF //B0
data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3
data8 0x3F9F1C71BC4A7823 //A9
data8 0x3F96E8BBAAB216B2 //A11
data8 0x3F91C4CA1F9F8A98 //A13
data8 0x3F8C9DDCEDEBE7A6 //A15
data8 0x3F877784442B1516 //A17
data8 0x3F859C0491802BA2 //A19
data8 0x9999999998C88B8F, 0x00003FFB //A5
data8 0x3F6BD7A9A660BF5E //A21
data8 0x3F9FC1659340419D //A23
data8 0xB6DB6DB798149BDF, 0x00003FFA //A7
data8 0xBFB3EF18964D3ED3 //A25
data8 0x3FCD285315542CF2 //A27
data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1
data8 0x3EF0DDA376D10FB3 //B10
data8 0xBEB83CAFE05EBAC9 //B11
data8 0x3F65FFB67B513644 //B4
data8 0x3F5032FBB86A4501 //B5
data8 0x3F392162276C7CBA //B6
data8 0x3F2435949FD98BDF //B7
data8 0xD93923D7FA08341C, 0x00003FF9 //B2
data8 0x3F802995B6D90BDB //B3
data8 0x3F10DF86B341A63F //B8
data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2
data8 0x3EFA3EBD6B0ECB9D //B9
data8 0x3EDE18BA080E9098 //B12
LOCAL_OBJECT_END(acos_base_range_table)
.section .text
// acos: double-precision arc cosine.
//   In:  f8 = x
//   Out: f8 = acos(x)
// Predicates established near the top and relied on by later code:
//   p12      x is a NaN, denormal, or zero  -> acos_special
//   p8 / p9  x < 0 / complement (from fcmp.lt)
//   p6 / p7  |x| < 0.625 / complement (integer compare of the DP bits)
//   p10      |x| == 1.0                     -> acos_abs_1
//   p11      |x| >  1.0                     -> acos_abs_gt_1
// The label acos_unnormal_back is re-entered from acos_special after an
// unnormal input has been normalized in f8.
GLOBAL_LIBM_ENTRY(acos)
acos_unnormal_back:
{ .mfi
getf.d rXBits = f8 // grab bits of input value
// set p12 = 1 if x is a NaN, denormal, or zero
fclass.m p12, p0 = f8, 0xcf
adds rSign = 1, r0
}
{ .mfi
addl rTblAddr = @ltoff(acos_base_range_table),gp
// 1 - x = 1 - |x| for positive x
fms.s1 f1mX = f1, f1, f8
addl rHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
{ .mfi
addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625
// set p8 = 1 if x < 0
fcmp.lt.s1 p8, p9 = f8, f0
shl rSign = rSign, 63 // sign bit
}
{ .mfi
// point to the beginning of the table
ld8 rTblAddr = [rTblAddr]
// 1 + x = 1 - |x| for negative x
fma.s1 f1pX = f1, f1, f8
adds rOne = 0x3FF, r0
}
;;
{ .mfi
andcm rAbsXBits = rXBits, rSign // bits of |x|
fmerge.s fSignX = f8, f1 // signum(x)
shl r0625 = r0625, 48 // bits of DP representation of 0.625
}
{ .mfb
setf.exp fHalf = rHalf // fHalf = 0.5 (rHalf holds the exponent of 1/2)
fma.s1 fXSqr = f8, f8, f0 // x^2
// branch on special path if x is a NaN, denormal, or zero
(p12) br.cond.spnt acos_special
}
;;
{ .mfi
adds rPiBy2Ptr = 272, rTblAddr // offset of the Pi/2 table entry
nop.f 0
shl rOne = rOne, 52 // bits of 1.0
}
{ .mfi
adds rTmpPtr1 = 16, rTblAddr
nop.f 0
// set p6 = 1 if |x| < 0.625
cmp.lt p6, p7 = rAbsXBits, r0625
}
;;
{ .mfi
ldfpd fA29, fA31 = [rTblAddr] // A29, fA31
// 1 - x = 1 - |x| for positive x
(p9) fms.s1 fR = f1, f1, f8
// point to coefficient of "near 1" polynomial
(p7) adds rTmpPtr2 = 176, rTblAddr
}
{ .mfi
ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35
// 1 + x = 1 - |x| for negative x
(p8) fma.s1 fR = f1, f1, f8
// point to coefficient of the base range polynomial (A3)
(p6) adds rTmpPtr2 = 48, rTblAddr
}
;;
{ .mfi
ldfe fB0 = [rTmpPtr1], 16 // B0
nop.f 0
nop.i 0
}
{ .mib
adds rTmpPtr3 = 16, rTmpPtr2
// set p10 = 1 if |x| = 1.0
cmp.eq p10, p0 = rAbsXBits, rOne
// branch on special path for |x| = 1.0
(p10) br.cond.spnt acos_abs_1
}
;;
{ .mfi
ldfe fA3 = [rTmpPtr2], 48 // A3 or B1
nop.f 0
adds rTmpPtr1 = 64, rTmpPtr3
}
{ .mib
ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11
// set p11 = 1 if |x| > 1.0
cmp.gt p11, p0 = rAbsXBits, rOne
// branch on special path for |x| > 1.0
(p11) br.cond.spnt acos_abs_gt_1
}
;;
{ .mfi
ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7
// initial approximation of 1 / sqrt(1 - x)
frsqrta.s1 f1mXRcp, p0 = f1mX
nop.i 0
}
{ .mfi
ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5
fma.s1 fXCube = fXSqr, f8, f0 // x^3
nop.i 0
}
;;
{ .mfi
ldfe fA5 = [rTmpPtr2], 48 // A5 or B2
// initial approximation of 1 / sqrt(1 + x)
frsqrta.s1 f1pXRcp, p0 = f1pX
nop.i 0
}
{ .mfi
ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8
fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4
nop.i 0
}
;;
{ .mfi
ldfe fA7 = [rTmpPtr1] // A7 or Pi/2
fma.s1 fRSqr = fR, fR, f0 // R^2
nop.i 0
}
{ .mfb
ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12
nop.f 0
(p6) br.cond.spnt acos_base_range;
}
;;
// fall through: 0.625 <= |x| < 1.0 ("near 1" path)
{ .mfi
nop.m 0
(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0
nop.i 0
}
{ .mfi
nop.m 0
(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0
nop.i 0
}
;;
{ .mfi
nop.m 0
(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0
nop.i 0
}
{ .mfi
nop.m 0
(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB11 = fB11, fR, fB10
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB1 = fB1, fR, fB0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB5 = fB5, fR, fB4
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB7 = fB7, fR, fB6
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB3 = fB3, fR, fB2
nop.i 0
}
;;
{ .mfi
nop.m 0
fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB9 = fB9, fR, fB8
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 fB12 = fB12, fRSqr, fB11
nop.i 0
}
{.mfi
nop.m 0
fma.s1 fB7 = fB7, fRSqr, fB5
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 fB3 = fB3, fRSqr, fB1
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0
nop.i 0
}
;;
{.mfi
nop.m 0
(p9) fma.s1 fCpi = f1, f0, f0 // Cpi = 0 if x > 0
nop.i 0
}
{ .mfi
nop.m 0
(p8) fma.s1 fCpi = fPiBy2, f1, fPiBy2 // Cpi = Pi if x < 0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB12 = fB12, fRSqr, fB9
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB7 = fB7, fRQuadr, fB3
nop.i 0
}
;;
{.mfi
nop.m 0
fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1
nop.i 0
}
{ .mfi
nop.m 0
fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fCloseTo1Pol = fB12, fR8, fB7
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1
nop.i 0
}
;;
{ .mfi
nop.m 0
// -signum(x)* S2 = -signum(x)*(S1 + S1*d1)
fma.s1 fSignedS = fSignedS, fD, fSignedS
nop.i 0
}
;;
{.mfi
nop.m 0
fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2
nop.i 0
}
;;
{ .mfi
nop.m 0
// Cpi + signum(x)*PolB*S2
fnma.s1 fCpi = fSignedS, fCloseTo1Pol, fCpi
nop.i 0
}
{ .mfi
nop.m 0
// signum(x)*PolB * S2
fnma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0
nop.i 0
}
;;
{ .mfb
nop.m 0
// final result for 0.625 <= |x| < 1
// = (Cpi + signum(x)*PolB*S2) + (signum(x)*PolB*S2)*d2
fma.d.s0 f8 = fCloseTo1Pol, fD, fCpi
// exit here for 0.625 <= |x| < 1
br.ret.sptk b0
}
;;
// here if |x| < 0.625
.align 32
acos_base_range:
{ .mfi
ldfe fCpi = [rPiBy2Ptr] // Pi/2
fma.s1 fA33 = fA33, fXSqr, fA31
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fXSqr, fA13
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA29 = fA29, fXSqr, fA27
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA25 = fA25, fXSqr, fA23
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA21 = fA21, fXSqr, fA19
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fXSqr, fA7
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA5 = fA5, fXSqr, fA3
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA35 = fA35, fXQuadr, fA33
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fXQuadr, fA15
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA25 = fA25, fXQuadr, fA21
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fXQuadr, fA5
nop.i 0
}
;;
{ .mfi
nop.m 0
fms.s1 fCpi = fCpi, f1, f8 // Pi/2 - x
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA35 = fA35, fXQuadr, fA29
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fXSqr, fA11
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fX16 = fX8, fX8, f0 // x^16
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA35 = fA35, fX8, fA25
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fX8, fA9
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fBaseP = fA35, fX16, fA17
nop.i 0
}
;;
{ .mfb
nop.m 0
// final result for |x| < 0.625
// = (Pi/2 - x) - x^3 * PolA(x^2)
fnma.d.s0 f8 = fBaseP, fXCube, fCpi
// exit here for |x| < 0.625 path
br.ret.sptk b0
}
;;
// here if |x| = 1
// acos(1) = 0
// acos(-1) = Pi
.align 32
acos_abs_1:
{ .mfi
ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
nop.f 0
nop.i 0
}
;;
.pred.rel "mutex", p8, p9
{ .mfi
nop.m 0
// result for x = 1.0
(p9) fma.d.s0 f8 = f1, f0, f0 // 0.0
nop.i 0
}
{.mfb
nop.m 0
// result for x = -1.0
(p8) fma.d.s0 f8 = fPiBy2, f1, fPiBy2 // Pi
// exit here for |x| = 1.0
br.ret.sptk b0
}
;;
// here if x is a NaN, denormal, or zero
.align 32
acos_special:
{ .mfi
// point to Pi/2
adds rPiBy2Ptr = 272, rTblAddr
// set p12 = 1 if x is a NaN
fclass.m p12, p0 = f8, 0xc3
nop.i 0
}
{ .mlx
nop.m 0
// smallest positive DP normalized number
movl rDenoBound = 0x0010000000000000
}
;;
{ .mfi
ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
// set p13 = 1 if x = 0.0
fclass.m p13, p0 = f8, 0x07
nop.i 0
}
{ .mfi
nop.m 0
fnorm.s1 fNormX = f8
nop.i 0
}
;;
{ .mfb
// load smallest normal to FP reg
setf.d fDenoBound = rDenoBound
// answer if x is a NaN
(p12) fma.d.s0 f8 = f8,f1,f0
// exit here if x is a NaN
(p12) br.ret.spnt b0
}
;;
{ .mfi
nop.m 0
// absolute value of normalized x
fmerge.s fNormX = f1, fNormX
nop.i 0
}
;;
{ .mfb
nop.m 0
// final result for x = 0
(p13) fma.d.s0 f8 = fPiBy2, f1, f8
// exit here if x = 0.0
(p13) br.ret.spnt b0
}
;;
// if we still here then x is denormal or unnormal
{ .mfi
nop.m 0
// set p14 = 1 if normalized |x| is greater than or
// equal to the smallest normalized value (fDenoBound).
// So, if p14 is set to 1 it means that we deal with
// unnormal rather than with "true" denormal
fcmp.ge.s1 p14, p0 = fNormX, fDenoBound
nop.i 0
}
;;
{ .mfi
nop.m 0
(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal
nop.i 0
}
{ .mfb
nop.m 0
// normalize unnormal input
(p14) fnorm.s1 f8 = f8
// return to the main path
(p14) br.cond.sptk acos_unnormal_back
}
;;
// if we still here it means that input is "true" denormal
{ .mfb
nop.m 0
// final result if x is denormal
fms.d.s0 f8 = fPiBy2, f1, f8 // Pi/2 - x
// exit here if x is denormal
br.ret.sptk b0
}
;;
// here if |x| > 1.0
// error handler should be called
.align 32
acos_abs_gt_1:
{ .mfi
alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
fmerge.s FR_X = f8,f8 // FR_X = copy of the original argument
nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 58 // error code
frcpa.s0 FR_RESULT, p0 = f0,f0 // 0/0: presumably yields the NaN result and raises invalid - confirm
// call error handler routine
br.cond.sptk __libm_error_region
}
;;
GLOBAL_LIBM_END(acos)
libm_alias_double_other (acos, acos)
// __libm_error_region: local trampoline that calls __libm_error_support.
// Reached by a branch (not a call) from acos_abs_gt_1, which has already
// allocated r32-r38 and set FR_X, FR_RESULT and GR_Parameter_TAG.
// It builds a 64-byte stack frame laid out as
//   sp+16  FR_X       (address passed in GR_Parameter_X)
//   sp+32  FR_Y       (address passed in GR_Parameter_Y)
//   sp+48  FR_RESULT  (address passed in GR_Parameter_RESULT)
// calls the handler, reloads f8 from sp+48, restores sp/b0/gp/ar.pfs and
// returns to acos's caller.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // back to the Parameter 2 slot
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
// Recompute the result address from sp rather than reusing the
// output register set up before the call.
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// Declare the external handler referenced by the br.call above.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,694 +0,0 @@
.file "acosf.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
// 02/02/00 Initial version
// 06/28/00 Improved speed
// 06/31/00 Changed register allocation because of some duplicate macros
// moved nan exit bundle up to gain a cycle.
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal.
// 03/13/01 Corrected sign of imm1 value in dep instruction.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 04/17/03 Moved mutex after label
// Description
//=========================================
// The acosf function computes the principal value of the arc cosine of x.
// A domain error occurs for arguments not in the range [-1,+1].
// The acosf function returns the arc cosine in the range [0, +pi] radians.
// acos(1) returns +0
// acos(x) returns a Nan and raises the invalid exception for |x| >1
// |x| <= sqrt(2)/2. get Ax and Bx
// poly_p1 = x p1
// poly_p3 = x2 p4 + p3
// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x
// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2
// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x
// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
// poly_p7 = x2 p8 + p7
// poly_p5 = x2 p6 + p5
// poly_p7 = x4 p9 + (x2 p8 + p7)
// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5
// sinf1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
// = x19 p9 + x17 p8 + x15 p7 + x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x
// answer1 = pi/2 - sinf1
// |x| > sqrt(2)/2
// Get z = sqrt(1-x2)
// Get polynomial in t = 1-x2
// t2 = t t
// t4 = t2 t2
// poly_p4 = t p5 + p4
// poly_p1 = t p1 + 1
// poly_p6 = t p7 + p6
// poly_p2 = t p3 + p2
// poly_p8 = t p9 + p8
// poly_p4 = t2 poly_p6 + poly_p4
// = t2 (t p7 + p6) + (t p5 + p4)
// poly_p2 = t2 poly_p2 + poly_p1
// = t2 (t p3 + p2) + (t p1 + 1)
// poly_p4 = t4 poly_p8 + poly_p4
// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))
// P(t) = poly_p2 + t4 poly_p4
// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)))
// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4
// answer2 = sign(x) z P(t) if x>0
// = sign(x) z P(t) + pi if x<0
//
// Assembly macros
//=========================================
// predicate registers
//acosf_pred_LEsqrt2by2 = p7
//acosf_pred_GTsqrt2by2 = p8
// integer registers
// Symbolic names for the stacked registers used by acosf.
// r33-r40 are the locals and r41-r44 the outputs of the
// "alloc r32 = ar.pfs,1,8,4,0" at function entry.
ACOSF_Addr1 = r33
ACOSF_Addr2 = r34
ACOSF_GR_1by2 = r35
ACOSF_GR_3by2 = r36
ACOSF_GR_5by2 = r37
GR_SAVE_B0 = r38
GR_SAVE_PFS = r39
GR_SAVE_GP = r40
GR_Parameter_X = r41
GR_Parameter_Y = r42
GR_Parameter_RESULT = r43
GR_Parameter_TAG = r44
// floating point registers
acosf_y = f32
acosf_abs_x = f33
acosf_x2 = f34
acosf_sgn_x = f35
acosf_1by2 = f36
acosf_3by2 = f37
acosf_5by2 = f38
acosf_coeff_P3 = f39
acosf_coeff_P8 = f40
acosf_coeff_P1 = f41
acosf_coeff_P4 = f42
acosf_coeff_P5 = f43
acosf_coeff_P2 = f44
acosf_coeff_P7 = f45
acosf_coeff_P6 = f46
acosf_coeff_P9 = f47
// NOTE(review): acosf_x2 is assigned a second time here (f34 above,
// f48 here); presumably this later assignment is the one in effect for
// the code that follows - confirm, and consider dropping the first.
acosf_x2 = f48
acosf_x3 = f49
acosf_x4 = f50
acosf_x8 = f51
acosf_x5 = f52
acosf_const_piby2 = f53
acosf_const_sqrt2by2 = f54
acosf_x11 = f55
acosf_poly_p1 = f56
acosf_poly_p3 = f57
acosf_sinf1 = f58
acosf_poly_p2 = f59
acosf_poly_Ax = f60
acosf_poly_p7 = f61
acosf_poly_p5 = f62
acosf_sgnx_t4 = f63
acosf_poly_Bx = f64
acosf_t = f65
acosf_yby2 = f66
acosf_B = f67
acosf_B2 = f68
acosf_Az = f69
acosf_dz = f70
acosf_Sz = f71
acosf_d2z = f72
acosf_Fz = f73
acosf_z = f74
acosf_sgnx_z = f75
acosf_t2 = f76
acosf_2poly_p4 = f77
acosf_2poly_p6 = f78
acosf_2poly_p1 = f79
acosf_2poly_p2 = f80
acosf_2poly_p8 = f81
acosf_t4 = f82
acosf_Pt = f83
acosf_sgnx_2poly_p2 = f84
acosf_sgn_x_piby2 = f85
acosf_poly_p7a = f86
acosf_2poly_p4a = f87
acosf_2poly_p4b = f88
acosf_2poly_p2a = f89
acosf_poly_p1a = f90
// Data tables
//==============================================================
// Two tables of double-precision constants for acosf.  Each is walked
// by its own pointer (ACOSF_Addr1 / ACOSF_Addr2), so the entry order
// matches the order of the loads in the function, not the coefficient
// index order.
RODATA
.align 16
// Read through ACOSF_Addr1 as the pairs (P1,P4), (P7,P6), (P9,sqrt(2)/2).
LOCAL_OBJECT_START(acosf_coeff_1_table)
data8 0x3FC5555607DCF816 // P1
data8 0x3F9CF81AD9BAB2C6 // P4
data8 0x3FC59E0975074DF3 // P7
data8 0xBFA6F4CC2780AA1D // P6
data8 0x3FC2DD45292E93CB // P9
data8 0x3fe6a09e667f3bcd // sqrt(2)/2
LOCAL_OBJECT_END(acosf_coeff_1_table)
// Read through ACOSF_Addr2 as the pairs (P3,P8), (P5,P2), then pi/2 alone.
LOCAL_OBJECT_START(acosf_coeff_2_table)
data8 0x3FA6F108E31EFBA6 // P3
data8 0xBFCA31BF175D82A0 // P8
data8 0x3FA30C0337F6418B // P5
data8 0x3FB332C9266CB1F9 // P2
data8 0x3ff921fb54442d18 // pi_by_2
LOCAL_OBJECT_END(acosf_coeff_2_table)
.section .text
GLOBAL_LIBM_ENTRY(acosf)
// Load the addresses of the two tables.
// Then, load the coefficients and other constants.
{ .mfi
alloc r32 = ar.pfs,1,8,4,0
fnma.s1 acosf_t = f8,f8,f1
dep.z ACOSF_GR_1by2 = 0x3f,24,8 // 0x3f000000
}
{ .mfi
addl ACOSF_Addr1 = @ltoff(acosf_coeff_1_table),gp
fma.s1 acosf_x2 = f8,f8,f0
addl ACOSF_Addr2 = @ltoff(acosf_coeff_2_table),gp ;;
}
{ .mfi
ld8 ACOSF_Addr1 = [ACOSF_Addr1]
fmerge.s acosf_abs_x = f1,f8
dep ACOSF_GR_3by2 = -1,r0,22,8 // 0x3fc00000
}
{ .mlx
nop.m 999
movl ACOSF_GR_5by2 = 0x40200000;;
}
{ .mfi
setf.s acosf_1by2 = ACOSF_GR_1by2
fmerge.s acosf_sgn_x = f8,f1
nop.i 999
}
{ .mfi
ld8 ACOSF_Addr2 = [ACOSF_Addr2]
nop.f 0
nop.i 999;;
}
{ .mfi
setf.s acosf_5by2 = ACOSF_GR_5by2
fcmp.lt.s1 p11,p12 = f8,f0
nop.i 999;;
}
{ .mmf
ldfpd acosf_coeff_P1,acosf_coeff_P4 = [ACOSF_Addr1],16
setf.s acosf_3by2 = ACOSF_GR_3by2
fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan
}
{ .mfi
ldfpd acosf_coeff_P7,acosf_coeff_P6 = [ACOSF_Addr1],16
fma.s1 acosf_t2 = acosf_t,acosf_t,f0
nop.i 999
}
{ .mfi
ldfpd acosf_coeff_P3,acosf_coeff_P8 = [ACOSF_Addr2],16
fma.s1 acosf_x4 = acosf_x2,acosf_x2,f0
nop.i 999;;
}
{ .mfi
ldfpd acosf_coeff_P9,acosf_const_sqrt2by2 = [ACOSF_Addr1]
fclass.m.unc p10,p0 = f8, 0x07 //@zero
nop.i 999
}
{ .mfi
ldfpd acosf_coeff_P5,acosf_coeff_P2 = [ACOSF_Addr2],16
fma.s1 acosf_x3 = f8,acosf_x2,f0
nop.i 999;;
}
{ .mfi
ldfd acosf_const_piby2 = [ACOSF_Addr2]
frsqrta.s1 acosf_B,p0 = acosf_t
nop.i 999
}
{ .mfb
nop.m 999
(p8) fma.s.s0 f8 = f8,f1,f0
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
{ .mfb
nop.m 999
fcmp.eq.s1 p6,p0 = acosf_abs_x,f1
(p10) br.cond.spnt ACOSF_ZERO ;; // Branch if x=0
}
{ .mfi
nop.m 999
fcmp.gt.s1 p9,p0 = acosf_abs_x,f1
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_x8 = acosf_x4,acosf_x4,f0
nop.i 999
}
{ .mfb
nop.m 999
fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0
(p6) br.cond.spnt ACOSF_ABS_ONE ;; // Branch if |x|=1
}
{ .mfi
nop.m 999
fma.s1 acosf_x5 = acosf_x2,acosf_x3,f0
nop.i 999
}
{ .mfb
(p9) mov GR_Parameter_TAG = 59
fma.s1 acosf_yby2 = acosf_t,acosf_1by2,f0
(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1
}
{ .mfi
nop.m 999
fma.s1 acosf_Az = acosf_t,acosf_B,f0
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_B2 = acosf_B,acosf_B,f0
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p1 = f8,acosf_coeff_P1,f0
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_2poly_p1 = acosf_coeff_P1,acosf_t,f1
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p3 = acosf_coeff_P4,acosf_x2,acosf_coeff_P3
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_2poly_p6 = acosf_coeff_P7,acosf_t,acosf_coeff_P6
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p7 = acosf_x2,acosf_coeff_P8,acosf_coeff_P7
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_2poly_p2 = acosf_coeff_P3,acosf_t,acosf_coeff_P2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p5 = acosf_x2,acosf_coeff_P6,acosf_coeff_P5
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_2poly_p4 = acosf_coeff_P5,acosf_t,acosf_coeff_P4
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_x11 = acosf_x8,acosf_x3,f0
nop.i 999
}
{ .mfi
nop.m 999
fnma.s1 acosf_dz = acosf_B2,acosf_yby2,acosf_1by2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p1a = acosf_x2,acosf_poly_p1,f8
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_2poly_p8 = acosf_coeff_P9,acosf_t,acosf_coeff_P8
nop.i 999;;
}
// Get the absolute value of x and determine the region in which x lies
{ .mfi
nop.m 999
fcmp.le.s1 p7,p8 = acosf_abs_x,acosf_const_sqrt2by2
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p2 = acosf_x2,acosf_poly_p3,acosf_coeff_P2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 acosf_poly_p7a = acosf_x4,acosf_coeff_P9,acosf_poly_p7
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 acosf_2poly_p2a = acosf_2poly_p2,acosf_t2,acosf_2poly_p1
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_sgnx_t4 = acosf_sgn_x,acosf_t4,f0
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_2poly_p4a = acosf_2poly_p6,acosf_t2,acosf_2poly_p4
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_Sz = acosf_5by2,acosf_dz,acosf_3by2
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_d2z = acosf_dz,acosf_dz,f0
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fnma.d.s1 acosf_sgn_x_piby2 = acosf_sgn_x,acosf_const_piby2,acosf_const_piby2
nop.i 999
}
{ .mfi
nop.m 999
(p7) fma.s1 acosf_poly_Ax = acosf_x5,acosf_poly_p2,acosf_poly_p1a
nop.i 999;;
}
{ .mfi
nop.m 999
(p7) fma.s1 acosf_poly_Bx = acosf_x4,acosf_poly_p7a,acosf_poly_p5
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_sgnx_2poly_p2 = acosf_sgn_x,acosf_2poly_p2a,f0
nop.i 999;;
}
{ .mfi
nop.m 999
fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_2poly_p4b = acosf_2poly_p8,acosf_t4,acosf_2poly_p4a
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 acosf_Fz = acosf_d2z,acosf_Sz,acosf_dz
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.d.s1 acosf_Pt = acosf_2poly_p4b,acosf_sgnx_t4,acosf_sgnx_2poly_p2
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.d.s1 acosf_z = acosf_Az,acosf_Fz,acosf_Az
nop.i 999 ;;
}
{ .mfi
nop.m 999
(p7) fma.d.s1 acosf_sinf1 = acosf_x11,acosf_poly_Bx,acosf_poly_Ax
nop.i 999;;
}
.pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2
{ .mfi
nop.m 999
(p8) fma.s.s0 f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2
nop.i 999
}
{ .mfb
nop.m 999
(p7) fms.s.s0 f8 = acosf_const_piby2,f1,acosf_sinf1
br.ret.sptk b0 ;;
}
ACOSF_ZERO:
// Here if x=0
{ .mfb
nop.m 999
fma.s.s0 f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2
br.ret.sptk b0 ;;
}
ACOSF_ABS_ONE:
.pred.rel "mutex",p11,p12
// Here if |x|=1
{ .mfi
nop.m 999
(p11) fma.s.s0 f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi
nop.i 999
}
{ .mfb
nop.m 999
(p12) fma.s.s0 f8 = f1,f0,f0 // acosf(1)=0
br.ret.sptk b0 ;;
}
GLOBAL_LIBM_END(acosf)
libm_alias_float_other (acos, acos)
// Stack operations when calling error support.
// (1) (2)
// sp -> + psp -> +
// | |
// | | <- GR_Y
// | |
// | <-GR_Y Y2->|
// | |
// | | <- GR_X
// | |
// sp-64 -> + sp -> +
// save ar.pfs save b0
// save gp
// Stack operations when calling error support.
// (3) (call) (4)
// psp -> + sp -> +
// | |
// R3 ->| <- GR_RESULT | -> f8
// | |
// Y2 ->| <- GR_Y |
// | |
// X1 ->| |
// | |
// sp -> + +
// restore gp
// restore ar.pfs
// Error path for acosf.  Reached by a branch (not a call) from the |x|>1
// test, with x still in f8 and the error code already in GR_Parameter_TAG.
// It builds a 64-byte frame, stores the three values that
// __libm_error_support is given pointers to, calls it, and returns whatever
// value is then found in the result slot.
// Frame layout, as offsets from the new sp:
//   sp+16  Parameter 1: x, single precision
//   sp+32  Parameter 2: f1 (presumably a placeholder for the unused second
//          argument -- confirm against __libm_error_support)
//   sp+48  Parameter 3: default result on the way in, final result on the
//          way out
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// Reads the old sp (sp is rewritten later in this same instruction group),
// so this is new sp + 32.
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 999
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment leaves GR_Parameter_Y at sp+48, the Parameter 3 slot.
stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mfi
nop.m 0
// Default result: reciprocal approximation of 0/0 in status field 0
// (presumably a NaN with the invalid flag raised -- see the ISA manual).
frcpa.s0 f9,p0 = f0,f0
nop.i 0
};;
{ .mib
stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
// The store uses the value of GR_Parameter_Y from before this group (sp+48);
// the add then moves it back to the Parameter 2 slot (sp+32) for the call.
stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// Recompute the result address from sp rather than trusting the register
// to have survived the call.
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// The error handler is an external function defined outside this file.
.type __libm_error_support#,@function
.global __libm_error_support#

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,854 +0,0 @@
.file "asin.s"
// Copyright (c) 2000 - 2003 Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
// 02/02/00 Initial version
// 08/17/00 New and much faster algorithm.
// 08/31/00 Avoided bank conflicts on loads, shortened |x|=1 path,
// fixed mfb split issue stalls.
// 12/19/00 Fixed small arg cases to force inexact, or inexact and underflow.
// 08/02/02 New and much faster algorithm II
// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
// The asin function computes the principal value of the arc sine of x.
// asin(0) returns 0, asin(1) returns pi/2, asin(-1) returns -pi/2.
// A domain error occurs for arguments not in the range [-1,+1].
//
// The asin function returns the arc sine in the range [-pi/2, +pi/2] radians.
//
// There are 8 paths:
// 1. x = +/-0.0
// Return asin(x) = +/-0.0
//
// 2. 0.0 < |x| < 0.625
// Return asin(x) = x + x^3 *PolA(x^2)
// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32
//
// 3. 0.625 <=|x| < 1.0
// Return asin(x) = sign(x) * ( Pi/2 - sqrt(R) * PolB(R))
// Where R = 1 - |x|,
// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12
//
// sqrt(R) is approximated using the following sequence:
// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,
// |eps| < 2^(-8)
// Then 3 iterations are used to refine the result:
// H0 = 0.5*y0
// S0 = R*y0
//
// d0 = 0.5 - H0*S0
// H1 = H0 + d0*H0
// S1 = S0 + d0*S0
//
// d1 = 0.5 - H1*S1
// H2 = H1 + d1*H1
// S2 = S1 + d1*S1
//
// d2 = 0.5 - H2*S2
// S3 = S2 + d2*S2
//
// S3 approximates sqrt(R) with enough accuracy for this algorithm
//
// So, the result should be reconstructed as follows:
// asin(x) = sign(x) * (Pi/2 - S3*PolB(R))
//
// But for optimization purposes the reconstruction step is slightly
// changed:
// asin(x) = sign(x)*(Pi/2 - PolB(R)*S2) - sign(x)*d2*S2*PolB(R)
//
// 4. |x| = 1.0
// Return asin(x) = sign(x)*Pi/2
//
// 5. 1.0 < |x| <= +INF
// A domain error occurs for arguments not in the range [-1,+1]
//
// 6. x = [S,Q]NaN
// Return asin(x) = QNaN
//
// 7. x is denormal
// Return asin(x) = x + x^3,
//
// 8. x is unnormal
// Normalize input in f8 and return to the very beginning of the function
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input, output
// f6, f7, f9 -> f15, f32 -> f63
// General registers used:
// r3, r21 -> r31, r32 -> r38
// Predicate registers used:
// p0, p6 -> p14
//
// Assembly macros
//=========================================
// integer registers used
// scratch
// -- table pointers
rTblAddr = r3
rPiBy2Ptr = r21
rTmpPtr1 = r31
rTmpPtr2 = r30
rTmpPtr3 = r22
// -- bit patterns of x and of the constants it is compared against
rXBits = r29
rAbsXBits = r25
rSign = r28
rOne = r24
r0625 = r27
rHalf = r26
rDenoBound = r23
// stacked: save area and arguments for the error handler call
GR_SAVE_PFS = r32
GR_SAVE_B0 = r33
GR_SAVE_GP = r34
GR_Parameter_X = r35
GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Parameter_TAG = r38
// floating point registers used
// -- values handed to the error handler (FR_X shares f10 with f1pX)
FR_RESULT = f8
FR_X = f10
FR_Y = f1
// -- scratch: powers of x
fXSqr = f6
fXCube = f7
fXQuadr = f9
// -- scratch: 1 +/- x, their reciprocal square root estimates, and the
//    H/S pair of the square root refinement
f1mX = f11
f1pX = f10
f1mXRcp = f13
f1pXRcp = f12
fH = f14
fS = f15
// -- f32 and up: coefficients A3..A35 of the |x| < 0.625 polynomial
fA3 = f32
fA5 = f33
fA7 = f34
fA9 = f35
fA11 = f36
fA13 = f37
fA15 = f38
fA17 = f39
fA19 = f40
fA21 = f41
fA23 = f42
fA25 = f43
fA27 = f44
fA29 = f45
fA31 = f46
fA33 = f47
fA35 = f48
// -- coefficients B0..B12 of the |x| >= 0.625 polynomial and Pi/2; all but
//    B0 reuse registers of the A set, since only one set is live per path
fB0 = f50
fB1 = f32
fB2 = f33
fB3 = f41
fB4 = f37
fB5 = f38
fB6 = f39
fB7 = f40
fB8 = f42
fB9 = f43
fB10 = f35
fB11 = f36
fB12 = f44
fPiBy2 = f34
// -- polynomial results and remaining temporaries
fBaseP = f49
fCloseTo1Pol = f55
fSignedS = f51
fD = f52
fHalf = f53
fR = f54
fRSqr = f60
fRQuadr = f61
fR8 = f62
fX8 = f59
fX16 = f63
fSignX = f56
fNormX = f58
fDenoBound = f57
// Data tables
//==============================================================
// Coefficient table for asin.  Entries written with a single data8 value
// occupy 8 bytes and are read in pairs with ldfpd; entries written as two
// data8 values occupy 16 bytes and are read with ldfe.  The order follows
// the pointer walk in asin, which addresses the table through hard-coded
// byte offsets (16, 48, 176, 272 and post-increments), so entries must not
// be reordered or resized.  Offsets below are in bytes from the table start.
RODATA
.align 16
LOCAL_OBJECT_START(asin_base_range_table)
// Ai: Polynomial coefficients for the asin(x), |x| < .625000
// Bi: Polynomial coefficients for the asin(x), |x| >= .625000
data8 0xBFDAAB56C01AE468 // A29   (offset 0)
data8 0x3FE1C470B76A5B2B // A31   (offset 8)
data8 0xBFDC5FF82A0C4205 // A33   (offset 16)
data8 0x3FC71FD88BFE93F0 // A35   (offset 24)
data8 0xB504F333F9DE6487, 0x00003FFF // B0    (offset 32)
data8 0xAAAAAAAAAAAAFC18, 0x00003FFC // A3    (offset 48)
data8 0x3F9F1C71BC4A7823 // A9    (offset 64)
data8 0x3F96E8BBAAB216B2 // A11   (offset 72)
data8 0x3F91C4CA1F9F8A98 // A13   (offset 80)
data8 0x3F8C9DDCEDEBE7A6 // A15   (offset 88)
data8 0x3F877784442B1516 // A17   (offset 96)
data8 0x3F859C0491802BA2 // A19   (offset 104)
data8 0x9999999998C88B8F, 0x00003FFB // A5    (offset 112)
data8 0x3F6BD7A9A660BF5E // A21   (offset 128)
data8 0x3F9FC1659340419D // A23   (offset 136)
data8 0xB6DB6DB798149BDF, 0x00003FFA // A7    (offset 144)
data8 0xBFB3EF18964D3ED3 // A25   (offset 160)
data8 0x3FCD285315542CF2 // A27   (offset 168)
data8 0xF15BEEEFF7D2966A, 0x00003FFB // B1    (offset 176)
data8 0x3EF0DDA376D10FB3 // B10   (offset 192)
data8 0xBEB83CAFE05EBAC9 // B11   (offset 200)
data8 0x3F65FFB67B513644 // B4    (offset 208)
data8 0x3F5032FBB86A4501 // B5    (offset 216)
data8 0x3F392162276C7CBA // B6    (offset 224)
data8 0x3F2435949FD98BDF // B7    (offset 232)
data8 0xD93923D7FA08341C, 0x00003FF9 // B2    (offset 240)
data8 0x3F802995B6D90BDB // B3    (offset 256)
data8 0x3F10DF86B341A63F // B8    (offset 264)
data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2  (offset 272)
data8 0x3EFA3EBD6B0ECB9D // B9    (offset 288)
data8 0x3EDE18BA080E9098 // B12   (offset 296)
LOCAL_OBJECT_END(asin_base_range_table)
.section .text
// asin: argument in f8, result returned in f8.
// This first part classifies x, loads the coefficient set for the selected
// range, dispatches the special cases (NaN/denormal/zero, |x| = 1, |x| > 1,
// |x| < 0.625) to their own labels, and then falls through into the
// 0.625 <= |x| < 1 computation:
//   asin(x) = sign(x) * (Pi/2 - sqrt(R) * PolB(R)),  R = 1 - |x|
// The range tests are done on the integer bit pattern of |x|.
GLOBAL_LIBM_ENTRY(asin)
// asin_special branches back here after normalizing an unnormal input in f8.
asin_unnormal_back:
{ .mfi
getf.d rXBits = f8 // grab bits of input value
// set p12 = 1 if x is a NaN, denormal, or zero
fclass.m p12, p0 = f8, 0xcf
adds rSign = 1, r0
}
{ .mfi
addl rTblAddr = @ltoff(asin_base_range_table),gp
// 1 - x = 1 - |x| for positive x
fms.s1 f1mX = f1, f1, f8
addl rHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
{ .mfi
addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625
// set p8 = 1 if x < 0 (p9 receives the complement)
fcmp.lt.s1 p8, p9 = f8, f0
shl rSign = rSign, 63 // sign bit
}
{ .mfi
// point to the beginning of the table
ld8 rTblAddr = [rTblAddr]
// 1 + x = 1 - |x| for negative x
fma.s1 f1pX = f1, f1, f8
adds rOne = 0x3FF, r0
}
;;
{ .mfi
andcm rAbsXBits = rXBits, rSign // bits of |x|
fmerge.s fSignX = f8, f1 // signum(x)
shl r0625 = r0625, 48 // bits of DP representation of 0.625
}
{ .mfb
setf.exp fHalf = rHalf // fHalf = 0.5, built from its exponent only
fma.s1 fXSqr = f8, f8, f0 // x^2
// branch on special path if x is a NaN, denormal, or zero
(p12) br.cond.spnt asin_special
}
;;
{ .mfi
adds rPiBy2Ptr = 272, rTblAddr // byte offset of the Pi/2 table entry
nop.f 0
shl rOne = rOne, 52 // bits of 1.0
}
{ .mfi
adds rTmpPtr1 = 16, rTblAddr // byte offset of the A33/A35 pair
nop.f 0
// set p6 = 1 if |x| < 0.625 (p7 receives the complement)
cmp.lt p6, p7 = rAbsXBits, r0625
}
;;
{ .mfi
ldfpd fA29, fA31 = [rTblAddr] // A29, A31
// 1 - x = 1 - |x| for positive x
(p9) fms.s1 fR = f1, f1, f8
// point to coefficient of "near 1" polynomial
(p7) adds rTmpPtr2 = 176, rTblAddr
}
{ .mfi
ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, A35
// 1 + x = 1 - |x| for negative x
(p8) fma.s1 fR = f1, f1, f8
// otherwise point to the first coefficient (A3) of the base range polynomial
(p6) adds rTmpPtr2 = 48, rTblAddr
}
;;
{ .mfi
ldfe fB0 = [rTmpPtr1], 16 // B0
nop.f 0
nop.i 0
}
{ .mib
adds rTmpPtr3 = 16, rTmpPtr2
// set p10 = 1 if |x| = 1.0
cmp.eq p10, p0 = rAbsXBits, rOne
// branch on special path for |x| = 1.0
(p10) br.cond.spnt asin_abs_1
}
;;
// From here on each load fetches either an A or a B coefficient, depending
// on which half of the table rTmpPtr2 was pointed at above.
{ .mfi
ldfe fA3 = [rTmpPtr2], 48 // A3 or B1
nop.f 0
// uses the value of rTmpPtr3 from before this group; the post-increment
// in the next bundle does not affect it
adds rTmpPtr1 = 64, rTmpPtr3
}
{ .mib
ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11
// set p11 = 1 if |x| > 1.0
cmp.gt p11, p0 = rAbsXBits, rOne
// branch on special path for |x| > 1.0
(p11) br.cond.spnt asin_abs_gt_1
}
;;
{ .mfi
ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7
// initial approximation of 1 / sqrt(1 - x)
frsqrta.s1 f1mXRcp, p0 = f1mX
nop.i 0
}
{ .mfi
ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5
fma.s1 fXCube = fXSqr, f8, f0 // x^3
nop.i 0
}
;;
{ .mfi
ldfe fA5 = [rTmpPtr2], 48 // A5 or B2
// initial approximation of 1 / sqrt(1 + x)
frsqrta.s1 f1pXRcp, p0 = f1pX
nop.i 0
}
{ .mfi
ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8
fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4
nop.i 0
}
;;
{ .mfi
ldfe fA7 = [rTmpPtr1] // A7 or Pi/2
fma.s1 fRSqr = fR, fR, f0 // R^2
nop.i 0
}
{ .mfb
ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12
nop.f 0
(p6) br.cond.spnt asin_base_range;
}
;;
// Only 0.625 <= |x| < 1 reaches this point.
// Start of the sqrt(R) refinement: y0 is the frsqrta estimate for the
// side selected by the sign of x, H0 = y0/2, S0 = R*y0.
{ .mfi
nop.m 0
(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0
nop.i 0
}
{ .mfi
nop.m 0
(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0
nop.i 0
}
;;
{ .mfi
nop.m 0
(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0
nop.i 0
}
{ .mfi
nop.m 0
(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4
nop.i 0
}
;;
// PolB(R) is assembled from pairs of coefficients, then combined with
// R^2, R^4 and R^8; each result overwrites the register of its
// highest-order coefficient.
{ .mfi
nop.m 0
fma.s1 fB11 = fB11, fR, fB10 // B10 + B11*R
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB1 = fB1, fR, fB0 // B0 + B1*R
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB5 = fB5, fR, fB4 // B4 + B5*R
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB7 = fB7, fR, fB6 // B6 + B7*R
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB3 = fB3, fR, fB2 // B2 + B3*R
nop.i 0
}
;;
{ .mfi
nop.m 0
fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB9 = fB9, fR, fB8 // B8 + B9*R
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 fB12 = fB12, fRSqr, fB11 // terms B10..B12
nop.i 0
}
{.mfi
nop.m 0
fma.s1 fB7 = fB7, fRSqr, fB5 // terms B4..B7
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 fB3 = fB3, fRSqr, fB1 // terms B0..B3
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 fPiBy2 = fPiBy2, fSignX, f0 // signum(x)*Pi/2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fB12 = fB12, fRSqr, fB9 // terms B8..B12
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fB7 = fB7, fRQuadr, fB3 // terms B0..B7
nop.i 0
}
;;
{.mfi
nop.m 0
fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1
nop.i 0
}
{ .mfi
nop.m 0
fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fCloseTo1Pol = fB12, fR8, fB7 // PolB(R), all terms B0..B12
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1
nop.i 0
}
;;
{ .mfi
nop.m 0
// -signum(x)* S2 = -signum(x)*(S1 + S1*d1)
fma.s1 fSignedS = fSignedS, fD, fSignedS
nop.i 0
}
;;
{.mfi
nop.m 0
fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2
nop.i 0
}
;;
{ .mfi
nop.m 0
// signum(x)*(Pi/2 - PolB*S2)
fma.s1 fPiBy2 = fSignedS, fCloseTo1Pol, fPiBy2
nop.i 0
}
{ .mfi
nop.m 0
// -signum(x)*PolB * S2
fma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0
nop.i 0
}
;;
{ .mfb
nop.m 0
// final result for 0.625 <= |x| < 1:
// signum(x)*(Pi/2 - PolB*S2) - signum(x)*PolB*S2*d2
fma.d.s0 f8 = fCloseTo1Pol, fD, fPiBy2
// exit here for 0.625 <= |x| < 1
br.ret.sptk b0
}
;;
// here if |x| < 0.625
// asin(x) = x + x^3 * PolA(x^2), PolA = A3 + A5*x^2 + ... + A35*x^32.
// Coefficient pairs are combined first, then merged using x^4, x^8 and
// x^16; each partial sum overwrites the register of its highest-order
// coefficient.
.align 32
asin_base_range:
{ .mfi
nop.m 0
fma.s1 fA33 = fA33, fXSqr, fA31 // A31 + A33*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fXSqr, fA13 // A13 + A15*x^2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA29 = fA29, fXSqr, fA27 // A27 + A29*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA25 = fA25, fXSqr, fA23 // A23 + A25*x^2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA21 = fA21, fXSqr, fA19 // A19 + A21*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fXSqr, fA7 // A7 + A9*x^2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA5 = fA5, fXSqr, fA3 // A3 + A5*x^2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA35 = fA35, fXQuadr, fA33 // terms A31..A35
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fXQuadr, fA15 // terms A13..A17
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA25 = fA25, fXQuadr, fA21 // terms A19..A25
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fXQuadr, fA5 // terms A3..A9
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA35 = fA35, fXQuadr, fA29 // terms A27..A35
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fXSqr, fA11 // terms A11..A17
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fX16 = fX8, fX8, f0 // x^16
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA35 = fA35, fX8, fA25 // terms A19..A35
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fX8, fA9 // terms A3..A17
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fBaseP = fA35, fX16, fA17 // PolA(x^2), all terms A3..A35
nop.i 0
}
;;
{ .mfb
nop.m 0
// final result for |x| < 0.625: x + x^3*PolA(x^2)
fma.d.s0 f8 = fBaseP, fXCube, f8
// exit here for |x| < 0.625 path
br.ret.sptk b0
}
;;
// here if |x| = 1
// asin(x) = sign(x) * Pi/2
// Pi/2 has not been loaded on this path yet, so it is fetched through
// rPiBy2Ptr, which was set to the Pi/2 table entry before the branch here.
.align 32
asin_abs_1:
{ .mfi
ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
nop.f 0
nop.i 0
}
;;
{.mfb
nop.m 0
// result for |x| = 1.0: Pi/2 multiplied by signum(x), rounded to double
fma.d.s0 f8 = fPiBy2, fSignX, f0
// exit here for |x| = 1.0
br.ret.sptk b0
}
;;
// here if x is a NaN, denormal, or zero
// Sorts out the inputs flagged by the class test at the function entry:
//   NaN       -> return x*1.0 (status field 0)
//   zero      -> return x unchanged
//   unnormal  -> normalize f8 and restart at asin_unnormal_back
//   denormal  -> return x + x^3
.align 32
asin_special:
{ .mfi
nop.m 0
// set p12 = 1 if x is a NaN
fclass.m p12, p0 = f8, 0xc3
nop.i 0
}
{ .mlx
nop.m 0
// smallest positive DP normalized number
movl rDenoBound = 0x0010000000000000
}
;;
{ .mfi
nop.m 0
// set p13 = 1 if x = 0.0
fclass.m p13, p0 = f8, 0x07
nop.i 0
}
{ .mfi
nop.m 0
// normalized copy of x, used below to tell unnormals from denormals
fnorm.s1 fNormX = f8
nop.i 0
}
;;
{ .mfb
// load smallest normal to FP reg
setf.d fDenoBound = rDenoBound
// answer if x is a NaN
(p12) fma.d.s0 f8 = f8,f1,f0
// exit here if x is a NaN
(p12) br.ret.spnt b0
}
;;
{ .mfb
nop.m 0
nop.f 0
// exit here if x = 0.0
(p13) br.ret.spnt b0
}
;;
// if we still here then x is denormal or unnormal
{ .mfi
nop.m 0
// absolute value of normalized x
fmerge.s fNormX = f1, fNormX
nop.i 0
}
;;
{ .mfi
nop.m 0
// set p14 = 1 if normalized |x| is greater than or
// equal to the smallest normalized DP value
// So, if p14 is set to 1 it means that we deal with
// unnormal rather than with "true" denormal
fcmp.ge.s1 p14, p0 = fNormX, fDenoBound
nop.i 0
}
;;
{ .mfi
nop.m 0
(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal
nop.i 0
}
{ .mfb
nop.m 0
// normalize unnormal input
(p14) fnorm.s1 f8 = f8
// return to the main path
(p14) br.cond.sptk asin_unnormal_back
}
;;
// if we still here it means that input is "true" denormal
{ .mfb
nop.m 0
// final result if x is denormal: x + x*x^2 (fXSqr was computed before
// the branch to this label)
fma.d.s0 f8 = f8, fXSqr, f8
// exit here if x is denormal
br.ret.sptk b0
}
;;
// here if |x| > 1.0
// error handler should be called
// Prepares the values __libm_error_region expects: the argument in FR_X,
// a default result in FR_RESULT and the error code in GR_Parameter_TAG.
.align 32
asin_abs_gt_1:
{ .mfi
// 0 inputs, 3 locals (r32-r34, the save registers) and 4 outputs
// (r35-r38, the GR_Parameter_* registers)
alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
fmerge.s FR_X = f8,f8 // copy x, because f8 is overwritten just below
nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 61 // error code
// default result: reciprocal approximation of 0/0 in status field 0
// (presumably a NaN with the invalid flag raised -- see the ISA manual)
frcpa.s0 FR_RESULT, p0 = f0,f0
// call error handler routine
br.cond.sptk __libm_error_region
}
;;
GLOBAL_LIBM_END(asin)
libm_alias_double_other (asin, asin)
// Error path for asin.  Reached by a branch (not a call) from asin_abs_gt_1
// with the argument in FR_X, the default result in FR_RESULT and the error
// code in GR_Parameter_TAG.  It builds a 64-byte frame, stores the three
// values that __libm_error_support is given pointers to, calls it, and
// returns whatever value is then found in the result slot.
// Frame layout, as offsets from the new sp:
//   sp+16  Parameter 1: FR_X, double precision
//   sp+32  Parameter 2: FR_Y (aliased to f1; presumably a placeholder for
//          the unused second argument -- confirm against the callee)
//   sp+48  Parameter 3: FR_RESULT on the way in, final result on the way out
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// Reads the old sp (sp is rewritten later in this same instruction group),
// so this is new sp + 32.
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment leaves GR_Parameter_Y at sp+48, the Parameter 3 slot.
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
// The store uses the value of GR_Parameter_Y from before this group (sp+48);
// the add then moves it back to the Parameter 2 slot (sp+32) for the call.
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
// Recompute the result address from sp rather than trusting the register
// to have survived the call.
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// The error handler is an external function defined outside this file.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,675 +0,0 @@
.file "asinf.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
// 02/02/00 Initial version
// 06/28/00 Improved speed
// 06/31/00 Changed register allocation because of some duplicate macros
// moved nan exit bundle up to gain a cycle.
// 08/08/00 Improved speed by avoiding SIR flush.
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal.
// 03/13/01 Corrected sign of imm1 value in dep instruction.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
// The asinf function computes the arc sine of x in the range [-pi/2,+pi/2].
// A domain error occurs for arguments not in the range [-1,+1].
// asinf(+-0) returns +-0
// asinf(x) returns a Nan and raises the invalid exception for |x| >1
// (For reference, the companion acosf function returns the arc cosine in the
// range [0, +pi] radians. A domain error occurs for arguments not in the
// range [-1,+1]; acosf(1) returns +0, and acosf(x) returns a NaN and raises
// the invalid exception for |x| > 1.)
// |x| <= sqrt(2)/2. get Ax and Bx
// poly_p1 = x p1
// poly_p3 = x2 p4 + p3
// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x
// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2
// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x
// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
// poly_p7 = x2 p8 + p7
// poly_p5 = x2 p6 + p5
// poly_p7 = x4 p9 + (poly_p7)
// poly_p7 = x4 p9 + (x2 p8 + p7)
// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5
// answer1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
// = x19 p9 + x17 p8 + x15 p7 + x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x
// |x| > sqrt(2)/2
// Get z = sqrt(1-x2)
// Get polynomial in t = 1-x2
// t2 = t t
// t4 = t2 t2
// poly_p4 = t p5 + p4
// poly_p1 = t p1 + 1
// poly_p6 = t p7 + p6
// poly_p2 = t p3 + p2
// poly_p8 = t p9 + p8
// poly_p4 = t2 poly_p6 + poly_p4
// = t2 (t p7 + p6) + (t p5 + p4)
// poly_p2 = t2 poly_p2 + poly_p1
// = t2 (t p3 + p2) + (t p1 + 1)
// poly_p4 = t4 poly_p8 + poly_p4
// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))
// P(t) = poly_p2 + t4 poly_p4
// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)))
// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4
// answer2 = - sign(x) z P(t) + (sign(x) pi/2)
//
// Assembly macros
//=========================================
// predicate registers
//asinf_pred_LEsqrt2by2 = p7
//asinf_pred_GTsqrt2by2 = p8
// integer registers
// Addresses of asinf_coeff_1_table and asinf_coeff_2_table
ASINF_Addr1 = r33
ASINF_Addr2 = r34
// Single precision bit patterns of 0.5, 1.5 and 2.5
ASINF_GR_1by2 = r35
ASINF_GR_3by2 = r36
ASINF_GR_5by2 = r37
// Save area used around the call to the error handler
GR_SAVE_B0 = r38
GR_SAVE_PFS = r39
GR_SAVE_GP = r40
// Arguments of the error handler
GR_Parameter_X = r41
GR_Parameter_Y = r42
GR_Parameter_RESULT = r43
GR_Parameter_TAG = r44
// floating point registers
// asinf_x2 used to be assigned twice, first to f34 and then to f48.  Both
// assignments preceded every use, so only the later one ever took effect;
// the dead f34 assignment has been dropped and f34 now has no alias.
asinf_y = f32
asinf_abs_x = f33
asinf_sgn_x = f35
asinf_1by2 = f36
asinf_3by2 = f37
asinf_5by2 = f38
// Polynomial coefficients, in the order they are paired up by the loads
asinf_coeff_P3 = f39
asinf_coeff_P8 = f40
asinf_coeff_P1 = f41
asinf_coeff_P4 = f42
asinf_coeff_P5 = f43
asinf_coeff_P2 = f44
asinf_coeff_P7 = f45
asinf_coeff_P6 = f46
asinf_coeff_P9 = f47
// Powers of x and the two loaded constants
asinf_x2 = f48
asinf_x3 = f49
asinf_x4 = f50
asinf_x8 = f51
asinf_x5 = f52
asinf_const_piby2 = f53
asinf_const_sqrt2by2 = f54
asinf_x11 = f55
// Partial sums for the |x| <= sqrt(2)/2 polynomial
asinf_poly_p1 = f56
asinf_poly_p3 = f57
asinf_sinf1 = f58
asinf_poly_p2 = f59
asinf_poly_Ax = f60
asinf_poly_p7 = f61
asinf_poly_p5 = f62
asinf_sgnx_t4 = f63
asinf_poly_Bx = f64
// t = 1 - x^2 and the temporaries of the sqrt(t) computation
asinf_t = f65
asinf_yby2 = f66
asinf_B = f67
asinf_B2 = f68
asinf_Az = f69
asinf_dz = f70
asinf_Sz = f71
asinf_d2z = f72
asinf_Fz = f73
asinf_z = f74
asinf_sgnx_z = f75
// Partial sums for the |x| > sqrt(2)/2 polynomial in t
asinf_t2 = f76
asinf_2poly_p4 = f77
asinf_2poly_p6 = f78
asinf_2poly_p1 = f79
asinf_2poly_p2 = f80
asinf_2poly_p8 = f81
asinf_t4 = f82
asinf_Pt = f83
asinf_sgnx_2poly_p2 = f84
asinf_sgn_x_piby2 = f85
asinf_poly_p7a = f86
asinf_2poly_p4a = f87
asinf_2poly_p4b = f88
asinf_2poly_p2a = f89
asinf_poly_p1a = f90
// Data tables
//==============================================================
// Coefficient tables for asinf.  Every entry is one 8-byte data8 value.
// asinf reads each table front to back, mostly two entries at a time with
// ldfpd, so the entry order must match the load sequence in asinf.
RODATA
.align 16
// Read through ASINF_Addr1: (P1,P4), (P7,P6), (P9, sqrt(2)/2)
LOCAL_OBJECT_START(asinf_coeff_1_table)
data8 0x3FC5555607DCF816 // P1
data8 0x3F9CF81AD9BAB2C6 // P4
data8 0x3FC59E0975074DF3 // P7
data8 0xBFA6F4CC2780AA1D // P6
data8 0x3FC2DD45292E93CB // P9
data8 0x3fe6a09e667f3bcd // sqrt(2)/2
LOCAL_OBJECT_END(asinf_coeff_1_table)
// Read through ASINF_Addr2: (P3,P8), (P5,P2), then pi/2 on its own
LOCAL_OBJECT_START(asinf_coeff_2_table)
data8 0x3FA6F108E31EFBA6 // P3
data8 0xBFCA31BF175D82A0 // P8
data8 0x3FA30C0337F6418B // P5
data8 0x3FB332C9266CB1F9 // P2
data8 0x3ff921fb54442d18 // pi_by_2
LOCAL_OBJECT_END(asinf_coeff_2_table)
.section .text
// asinf: argument in f8, result returned in f8.
// NaN and zero inputs return early, |x| = 1 goes to ASINF_ABS_ONE and
// |x| > 1 goes to __libm_error_region.  For the remaining inputs the
// partial sums of both polynomials are started before the range is known;
// the |x| <= sqrt(2)/2 test then sets p7/p8 and only the selected branch
// is completed:
//   p7 (|x| <= sqrt(2)/2): result = x11*poly_Bx + poly_Ax
//   p8 (|x| >  sqrt(2)/2): result = sign(x)*pi/2 - z*Pt
// where t = 1 - x^2, z approximates sqrt(t), and Pt is sign(x) times the
// polynomial in t.
GLOBAL_LIBM_ENTRY(asinf)
// Load the addresses of the two tables.
// Then, load the coefficients and other constants.
{ .mfi
alloc r32 = ar.pfs,1,8,4,0
fnma.s1 asinf_t = f8,f8,f1 // t = 1 - x^2
dep.z ASINF_GR_1by2 = 0x3f,24,8 // 0x3f000000
}
{ .mfi
addl ASINF_Addr1 = @ltoff(asinf_coeff_1_table),gp
fma.s1 asinf_x2 = f8,f8,f0 // x^2
addl ASINF_Addr2 = @ltoff(asinf_coeff_2_table),gp ;;
}
{ .mfi
ld8 ASINF_Addr1 = [ASINF_Addr1]
fmerge.s asinf_abs_x = f1,f8 // |x|: sign taken from f1, the rest from x
dep ASINF_GR_3by2 = -1,r0,22,8 // 0x3fc00000
}
{ .mlx
nop.m 999
movl ASINF_GR_5by2 = 0x40200000;;
}
{ .mfi
setf.s asinf_1by2 = ASINF_GR_1by2
fmerge.s asinf_sgn_x = f8,f1 // sign of x attached to the magnitude of f1
nop.i 999
}
{ .mfi
ld8 ASINF_Addr2 = [ASINF_Addr2]
nop.f 0
nop.i 999;;
}
{ .mfi
setf.s asinf_5by2 = ASINF_GR_5by2
fcmp.lt.s1 p11,p12 = f8,f0 // p11 = 1 if x < 0, p12 = complement
nop.i 999;;
}
{ .mmf
ldfpd asinf_coeff_P1,asinf_coeff_P4 = [ASINF_Addr1],16
setf.s asinf_3by2 = ASINF_GR_3by2
fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan
}
{ .mfi
ldfpd asinf_coeff_P7,asinf_coeff_P6 = [ASINF_Addr1],16
fma.s1 asinf_t2 = asinf_t,asinf_t,f0 // t^2
nop.i 999
}
{ .mfi
ldfpd asinf_coeff_P3,asinf_coeff_P8 = [ASINF_Addr2],16
fma.s1 asinf_x4 = asinf_x2,asinf_x2,f0 // x^4
nop.i 999;;
}
{ .mfi
ldfpd asinf_coeff_P9,asinf_const_sqrt2by2 = [ASINF_Addr1]
fclass.m.unc p10,p0 = f8, 0x07 //@zero
nop.i 999
}
{ .mfi
ldfpd asinf_coeff_P5,asinf_coeff_P2 = [ASINF_Addr2],16
fma.s1 asinf_x3 = f8,asinf_x2,f0 // x^3
nop.i 999;;
}
{ .mfi
ldfd asinf_const_piby2 = [ASINF_Addr2]
frsqrta.s1 asinf_B,p0 = asinf_t // B = initial approximation of 1/sqrt(t)
nop.i 999
}
{ .mfb
nop.m 999
(p8) fma.s.s0 f8 = f8,f1,f0
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
{ .mfb
nop.m 999
fcmp.eq.s1 p6,p0 = asinf_abs_x,f1 // p6 = 1 if |x| = 1
(p10) br.ret.spnt b0 ;; // Exit if x=0
}
{ .mfi
nop.m 999
fcmp.gt.s1 p9,p0 = asinf_abs_x,f1 // p9 = 1 if |x| > 1
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_x8 = asinf_x4,asinf_x4,f0 // x^8
nop.i 999
}
{ .mfb
nop.m 999
fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0 // t^4
(p6) br.cond.spnt ASINF_ABS_ONE ;; // Branch if |x|=1
}
{ .mfi
nop.m 999
fma.s1 asinf_x5 = asinf_x2,asinf_x3,f0 // x^5
nop.i 999
}
{ .mfb
(p9) mov GR_Parameter_TAG = 62
fma.s1 asinf_yby2 = asinf_t,asinf_1by2,f0 // t/2
(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1
}
{ .mfi
nop.m 999
fma.s1 asinf_Az = asinf_t,asinf_B,f0 // t*B, first estimate of sqrt(t)
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_B2 = asinf_B,asinf_B,f0 // B^2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p1 = f8,asinf_coeff_P1,f0 // x*P1
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_2poly_p1 = asinf_coeff_P1,asinf_t,f1 // P1*t + 1
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p3 = asinf_coeff_P4,asinf_x2,asinf_coeff_P3 // P4*x^2 + P3
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_2poly_p6 = asinf_coeff_P7,asinf_t,asinf_coeff_P6 // P7*t + P6
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p7 = asinf_x2,asinf_coeff_P8,asinf_coeff_P7 // P8*x^2 + P7
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_2poly_p2 = asinf_coeff_P3,asinf_t,asinf_coeff_P2 // P3*t + P2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p5 = asinf_x2,asinf_coeff_P6,asinf_coeff_P5 // P6*x^2 + P5
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_2poly_p4 = asinf_coeff_P5,asinf_t,asinf_coeff_P4 // P5*t + P4
nop.i 999;;
}
{ .mfi
nop.m 999
fma.d.s1 asinf_x11 = asinf_x8,asinf_x3,f0 // x^11
nop.i 999
}
{ .mfi
nop.m 999
fnma.s1 asinf_dz = asinf_B2,asinf_yby2,asinf_1by2 // dz = 1/2 - B^2*t/2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p1a = asinf_x2,asinf_poly_p1,f8 // x^3*P1 + x
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_2poly_p8 = asinf_coeff_P9,asinf_t,asinf_coeff_P8 // P9*t + P8
nop.i 999;;
}
// Get the absolute value of x and determine the region in which x lies
// (p7 = 1 if |x| <= sqrt(2)/2, p8 = complement)
{ .mfi
nop.m 999
fcmp.le.s1 p7,p8 = asinf_abs_x,asinf_const_sqrt2by2
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p2 = asinf_x2,asinf_poly_p3,asinf_coeff_P2 // x^2*poly_p3 + P2
nop.i 999;;
}
{ .mfi
nop.m 999
fma.s1 asinf_poly_p7a = asinf_x4,asinf_coeff_P9,asinf_poly_p7 // P9*x^4 + poly_p7
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 asinf_2poly_p2a = asinf_2poly_p2,asinf_t2,asinf_2poly_p1 // t^2*(P3*t+P2) + (P1*t+1)
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_sgnx_t4 = asinf_sgn_x,asinf_t4,f0 // sign(x)*t^4
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_2poly_p4a = asinf_2poly_p6,asinf_t2,asinf_2poly_p4 // t^2*(P7*t+P6) + (P5*t+P4)
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_Sz = asinf_5by2,asinf_dz,asinf_3by2 // Sz = 5/2*dz + 3/2
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_d2z = asinf_dz,asinf_dz,f0 // dz^2
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_sgn_x_piby2 = asinf_sgn_x,asinf_const_piby2,f0 // sign(x)*pi/2
nop.i 999
}
{ .mfi
nop.m 999
(p7) fma.d.s1 asinf_poly_Ax = asinf_x5,asinf_poly_p2,asinf_poly_p1a // x^5*poly_p2 + poly_p1a
nop.i 999;;
}
{ .mfi
nop.m 999
(p7) fma.d.s1 asinf_poly_Bx = asinf_x4,asinf_poly_p7a,asinf_poly_p5 // x^4*poly_p7a + poly_p5
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_sgnx_2poly_p2 = asinf_sgn_x,asinf_2poly_p2a,f0 // sign(x)*2poly_p2a
nop.i 999;;
}
{ .mfi
nop.m 999
fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal
nop.i 999
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_2poly_p4b = asinf_2poly_p8,asinf_t4,asinf_2poly_p4a // t^4*(P9*t+P8) + 2poly_p4a
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.s1 asinf_Fz = asinf_d2z,asinf_Sz,asinf_dz // Fz = dz^2*Sz + dz
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.d.s1 asinf_Pt = asinf_2poly_p4b,asinf_sgnx_t4,asinf_sgnx_2poly_p2 // sign(x)*P(t)
nop.i 999;;
}
{ .mfi
nop.m 999
(p8) fma.d.s1 asinf_z = asinf_Az,asinf_Fz,asinf_Az // z = Az + Az*Fz
nop.i 999;;
}
.pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2
{ .mfi
nop.m 999
// |x| > sqrt(2)/2: sign(x)*pi/2 - z*Pt
(p8) fnma.s.s0 f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2
nop.i 999
}
{ .mfb
nop.m 999
// |x| <= sqrt(2)/2: x^11*poly_Bx + poly_Ax
(p7) fma.s.s0 f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax
br.ret.sptk b0 ;;
}
ASINF_ABS_ONE:
// Here for short exit if |x|=1
// asinf_sgn_x and asinf_const_piby2 were both set up before the branch to
// this label.
{ .mfb
nop.m 999
fma.s.s0 f8 = asinf_sgn_x,asinf_const_piby2,f0 // result = sign(x)*pi/2
br.ret.sptk b0
}
;;
GLOBAL_LIBM_END(asinf)
libm_alias_float_other (asin, asin)
// Stack operations when calling error support.
// (1) (2)
// sp -> + psp -> +
// | |
// | | <- GR_Y
// | |
// | <-GR_Y Y2->|
// | |
// | | <- GR_X
// | |
// sp-64 -> + sp -> +
// save ar.pfs save b0
// save gp
// Stack operations when calling error support.
// (3) (call) (4)
// psp -> + sp -> +
// | |
// R3 ->| <- GR_RESULT | -> f8
// | |
// Y2 ->| <- GR_Y |
// | |
// X1 ->| |
// | |
// sp -> + +
// restore gp
// restore ar.pfs
// Error-reporting stub emitted after asinf.  It builds a 64-byte stack frame,
// spills three single-precision values for __libm_error_support, calls it,
// and returns the float found in the result slot afterwards.
// Frame layout once "add sp=-64,sp" has executed:
//   sp+16 : Parameter 1 (f8)
//   sp+32 : Parameter 2 (f1)
//   sp+48 : Parameter 3 / result slot (preloaded with f9)
// GR_Parameter_TAG is not written in this region; presumably the code that
// branches here sets it first -- TODO confirm against the branch site.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 999
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
// GR_Parameter_Y was computed from the old sp, so it now equals sp+32; the
// post-increment of 16 leaves it pointing at sp+48 (the result slot).
{ .mmi
stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
// NOTE(review): frcpa of f0/f0 (0/0) presumably yields the default NaN and
// raises the invalid flag in status field 0 -- confirm against the ISA manual.
{ .mfi
nop.m 0
frcpa.s0 f9,p0 = f0,f0
nop.i 0
};;
{ .mib
stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
// After the store below GR_Parameter_Y is moved back to sp+32 so that it
// again addresses Parameter 2 for the callee.
{ .mib
stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
// GR_Parameter_RESULT is recomputed after the call rather than trusted to
// survive it.
{ .mmi
nop.m 0
nop.m 0
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// The error handler is defined elsewhere; declare it as an external function.
.type __libm_error_support#,@function
.global __libm_error_support#

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,900 +0,0 @@
.file "atan2f.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
// 06/01/00 Initial version
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
// 01/05/01 Fixed flag settings for denormal input.
// 01/19/01 Added documentation
// 01/30/01 Improved speed
// 02/06/02 Corrected .section statement
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
// The atan2 function computes the principal value of the arc tangent of y/x using
// the signs of both arguments to determine the quadrant of the return value.
// A domain error may occur if both arguments are zero.
// The atan2 function returns the arc tangent of y/x in the range [-pi,+pi] radians.
//..
//..Let (v,u) = (y,x) if |y| <= |x|, and (v,u) = (x,y) otherwise. Note that
//..v and u can be negative. We state the relationship between atan2(y,x) and
//..atan(v/u).
//..
//..Let swap = false if v = y, and swap = true if v = x.
//..Define C according to the matrix
//..
//.. TABLE FOR C
//.. x +ve x -ve
//.. no swap (swap = false) sgn(y)*0 sgn(y)*pi
//.. swap (swap = true ) sgn(y)*pi/2 sgn(y)*pi/2
//..
//.. atan2(y,x) = C + atan(v/u) if no swap
//.. atan2(y,x) = C - atan(v/u) if swap
//..
//..This relationship is more efficient to compute as we accommodate signs in v and u
//..saving the need to obtain the absolute value before computation can proceed.
//..
//..Suppose (v,u) = (y,x), we calculate atan(v/u) as follows:
//..A = y * frcpa(x) (so A = (y/x)(1 - beta))
//..atan(y/x) = atan(A) + atan( ((y/x)-A)/(1 + (y/x)A) ), the second term is
//..a correction.
//..atan(A) is approximated by a polynomial
//..A + p1 A^3 + p2 A^5 + ... + p10 A^21,
//..atan(G) is approximated as follows:
//..Let G = (y - Ax)/(x + Ay), atan(G) can be approximated by G + p1 * g^3
//..where g is a limited precision approximation to G via g = (y - Ax)*frcpa(x + Ay).
//..
//..Suppose (v,u) = (x,y), we calculate atan(v/u) as follows:
//..Z = x * frcpa(y) (so Z = (x/y)(1 - beta))
//..atan(x/y) = atan(Z) + atan( ((x/y)-Z)/(1 + (x/y)Z) ), the second term is
//..a correction.
//..atan(Z) is approximated by a polynomial
//..Z + p1 Z^3 + p2 Z^5 + ... + p10 Z^21,
//..atan(T) is approximated as follows:
//..Let T = (x - Zy)/(y + Zx), atan(T) can be approximated by T + p1 * t^3
//..where t is a limited precision approximation to T via t = (x - Zy)*frcpa(y + Zx).
//..
//..
//..A = y * frcpa(x)
//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
//..
//..This polynomial is computed as follows:
//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
//..
//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
//..poly_A1 = poly_A2 + A4 * poly_A1
//..poly_A1 = poly_A3 + A4 * poly_A1
//..
//..poly_A4 = p1 * A
//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
//..poly_A5 = p2 + Asq * poly_A5
//..poly_A4 = poly_A4 + A5 * poly_A5
//..
//..atan_A = poly_A4 + A11 * poly_A1
//..
//..atan(G) is approximated as follows:
//..G_numer = y - A*x, G_denom = x + A*y
//..H1 = frcpa(G_denom)
//..H_beta = 1 - H1 * G_denom
//..H2 = H1 + H1 * H_beta
//..H_beta2 = H_beta*H_beta
//..H3 = H2 + H2*H_beta2
//..g = H1 * G_numer; gsq = g*g; atan_G = g*p1, atan_G = atan_G*gsq
//..atan_G = G_numer*H3 + atan_G
//..
//..
//..A = y * frcpa(x)
//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
//..
//..This polynomial is computed as follows:
//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
//..
//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
//..poly_A1 = poly_A2 + A4 * poly_A1
//..poly_A1 = poly_A3 + A4 * poly_A1
//..
//..poly_A4 = p1 * A
//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
//..poly_A5 = p2 + Asq * poly_A5
//..poly_A4 = poly_A4 + A5 * poly_A5
//..
//..atan_A = poly_A4 + A11 * poly_A1
//..
//..
//..====================================================================
//.. COEFFICIENTS USED IN THE COMPUTATION
//..====================================================================
//coef_pj, j = 1,2,...,10; atan(A) ~=~ A + p1 A^3 + p2 A^5 + ... + p10 A^21
//
// coef_p1 = -.3333332707155439167401311806315789E+00
// coef_p1 in dbl = BFD5 5555 1219 1621
//
// coef_p2 = .1999967670926658391827857030875748E+00
// coef_p2 in dbl = 3FC9 997E 7AFB FF4E
//
// coef_p3 = -.1427989384500152360161563301087296E+00
// coef_p3 in dbl = BFC2 473C 5145 EE38
//
// coef_p4 = .1105852823460720770079031213661163E+00
// coef_p4 in dbl = 3FBC 4F51 2B18 65F5
//
// coef_p5 = -.8811839915595312348625710228448363E-01
// coef_p5 in dbl = BFB6 8EED 6A8C FA32
//
// coef_p6 = .6742329836955067042153645159059714E-01
// coef_p6 in dbl = 3FB1 42A7 3D7C 54E3
//
// coef_p7 = -.4468571068774672908561591262231909E-01
// coef_p7 in dbl = BFA6 E10B A401 393F
//
// coef_p8 = .2252333246746511135532726960586493E-01
// coef_p8 in dbl = 3F97 105B 4160 F86B
//
// coef_p9 = -.7303884867007574742501716845542314E-02
// coef_p9 in dbl = BF7D EAAD AA33 6451
//
// coef_p10 = .1109686868355312093949039454619058E-02
// coef_p10 in dbl = 3F52 2E5D 33BC 9BAA
//
// Special values
//==============================================================
// Y x Result
// +number +inf +0
// -number +inf -0
// +number -inf +pi
// -number -inf -pi
//
// +inf +number +pi/2
// -inf +number -pi/2
// +inf -number +pi/2
// -inf -number -pi/2
//
// +inf +inf +pi/4
// -inf +inf -pi/4
// +inf -inf +3pi/4
// -inf -inf -3pi/4
//
// +1 +1 +pi/4
// -1 +1 -pi/4
// +1 -1 +3pi/4
// -1 -1 -3pi/4
//
// +number +0 +pi/2 // does not raise DBZ
// -number +0 -pi/2 // does not raise DBZ
// +number -0 +pi/2 // does not raise DBZ
// -number -0 -pi/2 // does not raise DBZ
//
// +0 +number +0
// -0 +number -0
// +0 -number +pi
// -0 -number -pi
//
// +0 +0 +0 // does not raise invalid
// -0 +0 -0 // does not raise invalid
// +0 -0 +pi // does not raise invalid
// -0 -0 -pi // does not raise invalid
//
// Nan anything quiet Y
// anything NaN quiet X
// atan2(+-0/+-0) sets double error tag to 37
// atan2f(+-0/+-0) sets single error tag to 38
// These are domain errors.
//
// Assembly macros
//=========================================
// integer registers
// Symbolic register names used by atan2f and its error region.
//
// General registers.  atan2f allocates its frame with
// "alloc r32 = ar.pfs,1,5,4,0", so r32-r37 are input/local registers and
// r38-r41 are the output registers that carry the arguments of
// __libm_error_support.
// Table pointers: Addr_1 walks atan2f_coef_table1, Addr_2 is Addr_1 + 0x30.
atan2f_GR_Addr_1 = r33
atan2f_GR_Addr_2 = r34
// Save slots for b0, ar.pfs and gp across the call in the error region.
GR_SAVE_B0 = r35
GR_SAVE_PFS = r36
GR_SAVE_GP = r37
// Stack addresses of the three parameters and the error tag (38 for atan2f).
GR_Parameter_X = r38
GR_Parameter_Y = r39
GR_Parameter_RESULT = r40
GR_Parameter_TAG = r41
// floating point registers
// Polynomial coefficients p1..p10; the numbering order follows the ldfpd
// pairs used to load them from the two coefficient tables.
atan2f_coef_p1 = f32
atan2f_coef_p10 = f33
atan2f_coef_p7 = f34
atan2f_coef_p6 = f35
atan2f_coef_p3 = f36
atan2f_coef_p2 = f37
atan2f_coef_p9 = f38
atan2f_coef_p8 = f39
atan2f_coef_p5 = f40
atan2f_coef_p4 = f41
// Multiples of pi loaded from the tail of atan2f_coef_table2.
atan2f_const_piby2 = f42
atan2f_const_pi = f43
atan2f_const_piby4 = f44
atan2f_const_3piby4 = f45
// x*x, y*y and x*y (f9 is x, f8 is y).
atan2f_xsq = f46
atan2f_ysq = f47
atan2f_xy = f48
// sgn(y)*0 when x >= 0, sgn(y)*1 when x < 0; multiplied by pi to form C.
atan2f_const_1 = f49
// Sign of y attached to magnitude 1.0 (fmerge.s f8,f1).
atan2f_sgn_Y = f50
// frcpa approximations: Z0 ~ 1/y, A0 ~ 1/x; Z = Z0*x, A = A0*y.
atan2f_Z0 = f51
atan2f_A0 = f52
atan2f_Z = f53
atan2f_A = f54
// Quadrant constant C from the table in the file header.
atan2f_C = f55
// Polynomial argument U (A, or -Z in the swap case) and its even powers.
atan2f_U = f56
atan2f_Usq = f57
atan2f_U4 = f58
atan2f_U6 = f59
atan2f_U8 = f60
// Partial sums of the polynomial; the suffix names the coefficients covered.
atan2f_poly_u109 = f61
atan2f_poly_u87 = f62
atan2f_poly_u65 = f63
atan2f_poly_u43 = f64
atan2f_poly_u21 = f65
atan2f_poly_u10to7 = f66
atan2f_poly_u6to3 = f67
atan2f_poly_u10to3 = f68
atan2f_poly_u10to0 = f69
atan2f_poly_u210 = f70
// Numerator/denominator of the correction term: T for the swap case,
// G for the no-swap case; R_* is whichever pair was selected.
atan2f_T_numer = f71
atan2f_T_denom = f72
atan2f_G_numer = f73
atan2f_G_denom = f74
atan2f_p1rnum = f75
atan2f_R_denom = f76
atan2f_R_numer = f77
// Accumulators for the correction term plus C.
atan2f_pR = f78
atan2f_pRC = f79
atan2f_pQRC = f80
// Reciprocal of R_denom: Q1 from frcpa, refined to Q2 and Q3.
atan2f_Q1 = f81
atan2f_Q_beta = f82
atan2f_Q2 = f83
atan2f_Q_beta2 = f84
atan2f_Q3 = f85
// r = Q1*R_numer (limited-precision quotient) and its square.
atan2f_r = f86
atan2f_rsq = f87
// NOTE(review): atan2f_poly_atan_U is not referenced by the atan2f code in
// this file section -- possibly unused.
atan2f_poly_atan_U = f88
// predicate registers
// These names are left commented out; the code uses p6-p9 directly (the
// file history cites an assembler bug with predicate macros).
//atan2f_Pred_Swap = p6 // |y| > |x|
//atan2f_Pred_noSwap = p7 // |y| <= |x|
//atan2f_Pred_Xpos = p8 // x >= 0
//atan2f_Pred_Xneg = p9 // x < 0
// Read-only constants for atan2f, stored as IEEE double bit patterns.
// Entries are laid out in the order the function's ldfpd pairs consume them.
// atan2f computes the address of table2 as table1 + 0x30 (six 8-byte
// entries), so the two objects must stay adjacent and in this order.
RODATA
.align 16
LOCAL_OBJECT_START(atan2f_coef_table1)
data8 0xBFD5555512191621 // p1
data8 0x3F522E5D33BC9BAA // p10
data8 0xBFA6E10BA401393F // p7
data8 0x3FB142A73D7C54E3 // p6
data8 0xBFC2473C5145EE38 // p3
data8 0x3FC9997E7AFBFF4E // p2
LOCAL_OBJECT_END(atan2f_coef_table1)
// Remaining coefficients followed by the pi multiples used for the
// quadrant constant and the special-value results.
LOCAL_OBJECT_START(atan2f_coef_table2)
data8 0xBF7DEAADAA336451 // p9
data8 0x3F97105B4160F86B // p8
data8 0xBFB68EED6A8CFA32 // p5
data8 0x3FBC4F512B1865F5 // p4
data8 0x3ff921fb54442d18 // pi/2
data8 0x400921fb54442d18 // pi
data8 0x3fe921fb54442d18 // pi/4
data8 0x4002d97c7f3321d2 // 3pi/4
LOCAL_OBJECT_END(atan2f_coef_table2)
// atan2f: single-precision arc tangent of y/x.
// In:   f8 = y, f9 = x
// Out:  f8 = result, rounded to single precision by the final fma.s.s0
// Flow: start the main-path arithmetic and table loads speculatively, then
//       divert to ATAN2F_XY_INF_NAN_ZERO if either input is NaN, infinite
//       or zero.  Otherwise select swap (p6, |y| > |x|) or no-swap (p7),
//       evaluate the degree-10 polynomial in U^2 plus the correction term,
//       and add the quadrant constant C.
// The (x,y) = (+-0,+-0) case branches to __libm_error_region.
.section .text
GLOBAL_IEEE754_ENTRY(atan2f)
{ .mfi
alloc r32 = ar.pfs,1,5,4,0
frcpa.s1 atan2f_Z0,p0 = f1,f8 // Approx to 1/y
nop.i 999
}
{ .mfi
addl atan2f_GR_Addr_1 = @ltoff(atan2f_coef_table1),gp
fma.s1 atan2f_xsq = f9,f9,f0
nop.i 999 ;;
}
{ .mfi
ld8 atan2f_GR_Addr_1 = [atan2f_GR_Addr_1]
frcpa.s1 atan2f_A0,p0 = f1,f9 // Approx to 1/x
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_ysq = f8,f8,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
fcmp.ge.s1 p8,p9 = f9,f0 // Set p8 if x>=0, p9 if x<0
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_xy = f9,f8,f0
nop.i 999 ;;
}
// Addr_2 = table1 + 0x30, i.e. the start of atan2f_coef_table2.
// sgn_Y takes the sign of y and the magnitude of 1.0.
{ .mfi
add atan2f_GR_Addr_2 = 0x30, atan2f_GR_Addr_1
fmerge.s atan2f_sgn_Y = f8,f1
nop.i 999 ;;
}
{ .mmf
ldfpd atan2f_coef_p1,atan2f_coef_p10 = [atan2f_GR_Addr_1],16
ldfpd atan2f_coef_p9,atan2f_coef_p8 = [atan2f_GR_Addr_2],16
fclass.m p10,p0 = f9,0xe7 // Test x @inf|@snan|@qnan|@zero
}
;;
// Swap-case correction denominator: T_denom = y + Z0*x*x (= y + Z*x).
{ .mfi
ldfpd atan2f_coef_p7,atan2f_coef_p6 = [atan2f_GR_Addr_1],16
fma.s1 atan2f_T_denom = atan2f_Z0,atan2f_xsq,f8
nop.i 999
}
{ .mfi
ldfpd atan2f_coef_p5,atan2f_coef_p4 = [atan2f_GR_Addr_2],16
fma.s1 atan2f_Z = atan2f_Z0,f9,f0
nop.i 999 ;;
}
// No-swap correction denominator: G_denom = x + A0*y*y (= x + A*y).
{ .mfi
ldfpd atan2f_coef_p3,atan2f_coef_p2 = [atan2f_GR_Addr_1],16
fma.s1 atan2f_G_denom = atan2f_A0,atan2f_ysq,f9
nop.i 999
}
{ .mfi
ldfpd atan2f_const_piby2,atan2f_const_pi = [atan2f_GR_Addr_2],16
fma.s1 atan2f_A = atan2f_A0,f8,f0
nop.i 999 ;;
}
{ .mfi
ldfpd atan2f_const_piby4,atan2f_const_3piby4 = [atan2f_GR_Addr_2]
fclass.m p11,p0 = f8,0xe7 // Test y @inf|@snan|@qnan|@zero
nop.i 999
}
// T_numer = x - Z0*x*y (= x - Z*y).
{ .mfb
nop.m 999
fnma.s1 atan2f_T_numer = atan2f_Z0,atan2f_xy,f9
(p10) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on x nan,inf,zero
}
// p6 if |y|>|x|, p7 if |x|>=|y| , use xsq and ysq for test
{ .mfi
nop.m 999
fcmp.gt.s1 p6,p7 = atan2f_ysq,atan2f_xsq
nop.i 999
}
// G_numer = y - A0*x*y (= y - A*x).
{ .mfb
nop.m 999
fnma.s1 atan2f_G_numer = atan2f_A0,atan2f_xy,f8
(p11) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on y nan,inf,zero
}
// const_1 = sgn(y)*0 for x >= 0, sgn(y)*1 for x < 0; it scales pi in the
// no-swap quadrant constant below.
{ .mfi
nop.m 999
(p8) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f0,f0
nop.i 999
}
{ .mfi
nop.m 999
(p9) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f1,f0
nop.i 999 ;;
}
// Select the polynomial argument: U = -Z when swapping, U = A otherwise.
{ .mfi
nop.m 999
(p6) fnma.s1 atan2f_U = atan2f_Z,f1,f0
nop.i 999
}
{ .mfi
nop.m 999
(p6) fma.s1 atan2f_Usq = atan2f_Z,atan2f_Z,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
(p7) fma.s1 atan2f_U = atan2f_A,f1,f0
nop.i 999
}
{ .mfi
nop.m 999
(p7) fma.s1 atan2f_Usq = atan2f_A,atan2f_A,f0
nop.i 999 ;;
}
// Select the correction quotient: Q1 ~ 1/R_denom, with R = -T (swap)
// or R = G (no swap).
{ .mfi
nop.m 999
(p6) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_T_denom
nop.i 999
}
{ .mfi
nop.m 999
(p6) fma.s1 atan2f_R_denom = atan2f_T_denom,f1,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
(p7) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_G_denom
nop.i 999
}
{ .mfi
nop.m 999
(p7) fma.s1 atan2f_R_denom = atan2f_G_denom,f1,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
(p6) fnma.s1 atan2f_R_numer = atan2f_T_numer,f1,f0
nop.i 999
}
{ .mfi
nop.m 999
(p7) fma.s1 atan2f_R_numer = atan2f_G_numer,f1,f0
nop.i 999 ;;
}
// p1rnum = p1 * R_numer (sign already folded in for the swap case).
{ .mfi
nop.m 999
(p6) fnma.s1 atan2f_p1rnum = atan2f_T_numer,atan2f_coef_p1,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
(p7) fma.s1 atan2f_p1rnum = atan2f_G_numer,atan2f_coef_p1,f0
nop.i 999 ;;
}
// Common path: pairwise polynomial terms p(2k-1) + Usq*p(2k), interleaved
// with the Newton-style refinement of Q1.
{ .mfi
nop.m 999
fma.s1 atan2f_U4 = atan2f_Usq,atan2f_Usq,f0
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u109 = atan2f_Usq,atan2f_coef_p10,atan2f_coef_p9
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u87 = atan2f_Usq,atan2f_coef_p8,atan2f_coef_p7
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u65 = atan2f_Usq,atan2f_coef_p6,atan2f_coef_p5
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u43 = atan2f_Usq,atan2f_coef_p4,atan2f_coef_p3
nop.i 999
}
// Q_beta = 1 - Q1*R_denom, the relative error of the frcpa estimate.
{ .mfi
nop.m 999
fnma.s1 atan2f_Q_beta = atan2f_Q1,atan2f_R_denom,f1
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u21 = atan2f_Usq,atan2f_coef_p2,atan2f_coef_p1
nop.i 999
}
// r = Q1*R_numer, the limited-precision quotient used for the cubic term.
{ .mfi
nop.m 999
fma.s1 atan2f_r = atan2f_Q1,atan2f_R_numer,f0
nop.i 999 ;;
}
// Quadrant constant: C = sgn(y)*pi/2 (swap) or const_1*pi (no swap).
{ .mfi
nop.m 999
(p6) fma.s1 atan2f_C = atan2f_sgn_Y,atan2f_const_piby2,f0
nop.i 999
}
{ .mfi
nop.m 999
(p7) fma.s1 atan2f_C = atan2f_const_1,atan2f_const_pi,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_U6 = atan2f_U4,atan2f_Usq,f0
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_U8 = atan2f_U4,atan2f_U4,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u10to7 = atan2f_U4,atan2f_poly_u109,atan2f_poly_u87
nop.i 999
}
// pR = p1 * R_numer * Q1 (= p1 * r).
{ .mfi
nop.m 999
fma.s1 atan2f_pR = atan2f_p1rnum,atan2f_Q1,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u6to3 = atan2f_U4,atan2f_poly_u65,atan2f_poly_u43
nop.i 999
}
// Q2 = Q1*(1 + Q_beta).
{ .mfi
nop.m 999
fma.s1 atan2f_Q2 = atan2f_Q1,atan2f_Q_beta,atan2f_Q1
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_Q_beta2 = atan2f_Q_beta,atan2f_Q_beta,f0
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_rsq = atan2f_r,atan2f_r,f0
nop.i 999 ;;
}
// poly_u210 = 1 + Usq*(p1 + Usq*p2).
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u210 = atan2f_Usq,atan2f_poly_u21,f1
nop.i 999 ;;
}
// This compare overwrites p8, which is not read again on the main path;
// it is issued in status field 0 so the flag lands in the user-visible status.
{ .mfi
nop.m 999
fcmp.eq.s0 p8,p0 = f8,f9 // Dummy op to set flag on denormal inputs
nop.i 999
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u10to3 = atan2f_U8,atan2f_poly_u10to7,atan2f_poly_u6to3
nop.i 999 ;;
}
// Q3 = Q2*(1 + Q_beta^2).
{ .mfi
nop.m 999
fma.s1 atan2f_Q3 = atan2f_Q2,atan2f_Q_beta2,atan2f_Q2
nop.i 999
}
// pRC = C + p1*r^3.
{ .mfi
nop.m 999
fma.s1 atan2f_pRC = atan2f_rsq,atan2f_pR,atan2f_C
nop.i 999 ;;
}
{ .mfi
nop.m 999
fma.s1 atan2f_poly_u10to0 = atan2f_U6,atan2f_poly_u10to3,atan2f_poly_u210
nop.i 999 ;;
}
// pQRC = C + p1*r^3 + R_numer*Q3 (quadrant constant plus correction).
{ .mfi
nop.m 999
fma.s1 atan2f_pQRC = atan2f_R_numer,atan2f_Q3,atan2f_pRC
nop.i 999 ;;
}
// Final result = U * poly(U^2) + pQRC, rounded to single in status field 0.
{ .mfb
nop.m 999
fma.s.s0 f8 = atan2f_U,atan2f_poly_u10to0,atan2f_pQRC
br.ret.sptk b0 ;;
}
// Special-value path: reached when x or y is NaN, infinite or zero.
// Cases are peeled off in order: y NaN, x NaN, x +inf, y inf, x -inf,
// then the zero cases.  sgn_Y and the pi constants were already loaded on
// the main path before either branch here.
ATAN2F_XY_INF_NAN_ZERO:
{ .mfi
nop.m 999
fclass.m p10,p0 = f8,0xc3 // Is y nan
nop.i 999
}
;;
{ .mfi
nop.m 999
fclass.m p12,p0 = f9,0xc3 // Is x nan
nop.i 999
}
;;
{ .mfi
nop.m 999
fclass.m p6,p0 = f9,0x21 // Is x +inf
nop.i 999
}
{ .mfb
nop.m 999
(p10) fma.s.s0 f8 = f9,f8,f0 // Result quietized y if y is nan
(p10) br.ret.spnt b0 // Exit if y is nan
}
;;
{ .mfi
nop.m 999
(p6) fclass.m.unc p7,p8 = f8,0x23 // x +inf, is y inf
nop.i 999
}
{ .mfb
nop.m 999
(p12) fnorm.s.s0 f8 = f9 // Result quietized x if x is nan, y not nan
(p12) br.ret.spnt b0 // Exit if x is nan, y not nan
}
;;
// Here if x or y inf, or x or y zero
{ .mfi
nop.m 999
fcmp.eq.s0 p15,p0 = f8,f9 // Dummy op to set flag on denormal inputs
nop.i 999
}
;;
{ .mfi
nop.m 999
fclass.m p11,p12 = f9,0x22 // Is x -inf
nop.i 999
}
{ .mfb
nop.m 999
(p7) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4
(p7) br.ret.spnt b0 // Exit if x +inf and y inf
}
;;
{ .mfb
nop.m 999
(p8) fmerge.s f8 = f8,f0 // If x +inf and y not inf, result +-0
(p8) br.ret.spnt b0 // Exit if x +inf and y not inf
}
;;
{ .mfi
nop.m 999
(p12) fclass.m.unc p13,p0 = f8,0x23 // x not -inf, is y inf
nop.i 999
}
;;
// The .unc form clears p14 and p15 when p11 is false, so the dummy compare
// result left in p15 above cannot leak into the predicated fma below.
{ .mfi
nop.m 999
(p11) fclass.m.unc p14,p15 = f8,0x23 // x -inf, is y inf
nop.i 999
}
;;
{ .mfi
nop.m 999
fclass.m p6,p7 = f9,0x7 // Is x zero
nop.i 999
}
{ .mfb
nop.m 999
(p13) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2
(p13) br.ret.spnt b0 // Exit if x not -inf and y inf
}
;;
{ .mfi
nop.m 999
(p14) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4
nop.i 999
}
{ .mfb
nop.m 999
(p15) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi
(p11) br.ret.spnt b0 // Exit if x -inf
}
;;
// Here if x or y zero
{ .mfi
nop.m 999
(p7) fclass.m.unc p8,p9 = f9,0x19 // x not zero, y zero, is x > zero
nop.i 999
}
;;
{ .mfi
nop.m 999
(p6) fclass.m.unc p10,p11 = f8,0x7 // x zero, is y zero
nop.i 999
}
;;
{ .mfi
nop.m 999
(p8) fmerge.s f8 = f8, f0 // x > zero and y zero, result is +-zero
nop.i 999
}
{ .mfb
nop.m 999
(p9) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi
(p10) br.cond.spnt __libm_error_region // Branch if x zero and y zero
}
;;
{ .mfb
nop.m 999
(p11) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero
br.ret.sptk b0 // Final special case exit
}
;;
GLOBAL_IEEE754_END(atan2f)
libm_alias_float_other (__atan2, atan2)
// Error path for atan2f(+-0, +-0), reached from the special-value code when
// both x and y are zero.  It sets error tag 38, precomputes the default
// result in f10, spills y, x and that result to a 64-byte frame, calls
// __libm_error_support and returns the float left in the result slot.
// Frame layout once "add sp=-64,sp" has executed:
//   sp+16 : Parameter 1 (f8 = y)
//   sp+32 : Parameter 2 (f9 = x)
//   sp+48 : Parameter 3 / result slot (preloaded with f10)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
mov GR_Parameter_TAG = 38
fclass.m p10,p11 = f9,0x5 // @zero | @pos
;;
// Default result: y's sign on a zero when x is +0 (p10), otherwise
// sgn(y)*pi for x = -0 (p11).
(p10) fmerge.s f10 = f8, f0
(p11) fma.s.s0 f10 = atan2f_sgn_Y, atan2f_const_pi,f0
;;
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 999
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
}
;;
// GR_Parameter_Y was computed from the old sp, so it now equals sp+32; the
// post-increment of 16 leaves it pointing at sp+48 (the result slot).
{ .mmi
stfs [GR_Parameter_Y] = f9,16 // Store Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
}
;;
.body
{ .mib
stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
// After the store below GR_Parameter_Y is moved back to sp+32 so that it
// again addresses Parameter 2 for the callee.
{ .mib
stfs [GR_Parameter_Y] = f10 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
}
;;
// GR_Parameter_RESULT is recomputed after the call rather than trusted to
// survive it.
{ .mmi
nop.m 0
nop.m 0
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
}
;;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
}
;;
LOCAL_LIBM_END(__libm_error_region)
// The error handler is defined elsewhere; declare it as an external function.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

File diff suppressed because it is too large Load Diff

View File

@ -1,845 +0,0 @@
.file "atanhf.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 05/22/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 08/06/02 Improved Itanium 2 performance
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 05/26/03 Improved performance, fixed to handle unorms
//
// API
//==============================================================
// float atanhf(float)
//
// Overview of operation
//==============================================================
// Background
//
//
// There are 7 paths:
// 1. x = +/-0.0
// Return atanhf(x) = +/-0.0
//
// 2. 0.0 < |x| <= MAX_DENORMAL_ABS
// Return atanhf(x) = x + sign(x)*x^2
//
// 3. MAX_DENORMAL_ABS < |x| < 2^(-20)
// Return atanhf(x) = Pol3(x), where Pol3(x) = x + x^3
//
// 4. 2^(-20) <= |x| < 1
// Return atanhf(x) = 0.5 * (log(1 + x) - log(1 - x))
// Algorithm description for log function see below.
//
// 5. |x| = 1
// Return atanhf(x) = sign(x) * +INF
//
// 6. 1 < |x| <= +INF
// Return atanhf(x) = QNaN
//
// 7. x = [S,Q]NaN
// Return atanhf(x) = QNaN
//
//==============================================================
// Algorithm Description for log(x) function
//
// Consider x = 2^N * 1.f1 f2 f3 f4...f63
// log(x) = log(x * frcpa(x) / frcpa(x))
// = log(x * frcpa(x)) + log(1/frcpa(x))
// = log(x * frcpa(x)) - log(frcpa(x))
//
// frcpa(x) = 2^(-N) * frcpa(1.f1 f2 ... f63)
//
// -log(frcpa(x)) = -log(C)
// = -log(2^(-N)) - log(frcpa(1.f1 f2 ... f63))
//
// -log(frcpa(x)) = -log(C)
// = N*log2 - log(frcpa(1.f1 f2 ... f63))
//
//
// log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
//
// log(x) = N*log2 + log(1./frcpa(1.f1 f2 ... f63)) + log(x * frcpa(x))
// log(x) = N*log2 + T + log(frcpa(x) x)
//
// Log(x) = N*log2 + T + log(C * x)
//
// C * x = 1 + r
//
// log(x) = N*log2 + T + log(1 + r)
// log(x) = N*log2 + T + Series(r)
//
// 1.f1 f2 ... f8 has 256 entries.
// They are 1 + k/2^8, k = 0 ... 255
// These 256 values are the table entries.
//
// Implementation
//==============================================================
// C = frcpa(x)
// r = C * x - 1
//
// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4
//
// x = f * 2^N where f is 1.f_1f_2f_3...f_63
// Nfloat = float(n) where n is the true unbiased exponent
// pre-index = f_1f_2....f_8
// index = pre_index * 16
// get the dxt table entry at index + offset = T
//
// result = (T + Nfloat * log(2)) + rseries
//
// The T table is calculated as follows
// Form x_k = 1 + k/2^8 where k goes from 0... 255
// y_k = frcpa(x_k)
// log(1/y_k) in quad and round to double-extended
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input
// f32 -> f59
// General registers used:
// r14 -> r29, r32 -> r39
// Predicate registers used:
// p6 -> p9
// p6 to filter out case when |x| >= 1
// p7 to filter out case when x = [Q,S]NaN or +/-0
// p8 to filter out case when |x| < 2^(-20)
// p9 to filter out case when x = denormal
// Assembly macros
//==============================================================
// Symbolic register names for atanhf.  The function body is not part of
// this section, so the role notes below are inferred from the names and the
// algorithm description above -- confirm against the code that uses them.
//
// General registers.  The m/M and p/P suffixes presumably pair each value
// with the (1 - x) and (1 + x) operands of the two log evaluations.
DataPtr = r14
RcpTablePtrM = r15
RcpTablePtrP = r16
rExpbMask = r17
rBias = r18
rNearZeroBound = r19
rArgSExpb = r20
rArgExpb = r21
rExpbm = r22
rExpbp = r23
rSigm = r24
rSigp = r25
rNm = r26
rNp = r27
rIndm = r28
rIndp = r29
// Stacked registers: save slots, error-support parameter addresses and the
// error tag (presumably used by an error-reporting call sequence).
GR_SAVE_B0 = r33
GR_SAVE_GP = r34
GR_SAVE_PFS = r35
GR_Parameter_X = r36
GR_Parameter_Y = r37
GR_Parameter_RESULT = r38
atanh_GR_tag = r39
//==============================================================
// Floating-point registers.
fOneMx = f33
fOnePx = f34
fRm2 = f35
fRm3 = f36
fRp2 = f37
fRp3 = f38
fRcpM = f39
fRcpP = f40
fRp = f41
fRm = f42
fN4CvtM = f43
fN4CvtP = f44
fNm = f45
fNp = f46
fLogTm = f47
fLogTp = f48
fLog2 = f49
// NOTE(review): fArgAbs and fNormX both name f50, so they share one
// register; presumably their live ranges do not overlap -- verify.
fArgAbs = f50
fNormX = f50
fP32m = f51
fP32p = f52
fP10m = f53
fP10p = f54
fX2 = f55
// Series coefficients and constants, presumably loaded from atanhf_data.
fP3 = f56
fP2 = f57
fP1 = f58
fHalf = f59
// Data tables
//==============================================================
RODATA
.align 16
// Polynomial coefficients and constants for atanhf, stored as doubles and
// already multiplied by 0.5.
// atanhf reads this object strictly in order with post-incrementing loads:
// ldfpd (P3,P2) +16, ldfpd (P1,0.5) +16, ldfd (0.5*ln(2)) +16.  After the
// last load the pointer has advanced 48 bytes and is used directly as the
// base of the log table, so this object must stay exactly 48 bytes long
// (hence the pad entry) and atanhf_data2 must follow it immediately.
LOCAL_OBJECT_START(atanhf_data)
data8 0xbfc0001008f39d59 // P3*0.5
data8 0x3fc5556073e0c45a // P2*0.5
data8 0xbfcffffffffaea15 // P1*0.5
data8 0x3fe0000000000000 // 0.5
data8 0x3fd62e42fefa39ef // 0.5*ln(2)
data8 0x0000000000000000 // pad
LOCAL_OBJECT_END(atanhf_data)
// Log table for atanhf: 256 doubles, entry k = log(1/frcpa(1 + k/256))/2.
// atanhf indexes it with the 8 significand bits extracted at bit position
// 55 (extr.u ...,55,8) of (1 - x) or (1 + x), scaled by 8 bytes per entry.
// The table base is reached by post-incrementing past atanhf_data, so the
// two objects must remain adjacent and in this order.
LOCAL_OBJECT_START(atanhf_data2)
data8 0x3f50040155d5889e //log(1/frcpa(1+0/256))/2
data8 0x3f68121214586b54 //log(1/frcpa(1+1/256))/2
data8 0x3f741929f96832f0 //log(1/frcpa(1+2/256))/2
data8 0x3f7c317384c75f06 //log(1/frcpa(1+3/256))/2
data8 0x3f81a6b91ac73386 //log(1/frcpa(1+4/256))/2
data8 0x3f85ba9a5d9ac039 //log(1/frcpa(1+5/256))/2
data8 0x3f89d2a8074325f4 //log(1/frcpa(1+6/256))/2
data8 0x3f8d6b2725979802 //log(1/frcpa(1+7/256))/2
data8 0x3f90c58fa19dfaaa //log(1/frcpa(1+8/256))/2
data8 0x3f92954c78cbce1b //log(1/frcpa(1+9/256))/2
data8 0x3f94a94d2da96c56 //log(1/frcpa(1+10/256))/2
data8 0x3f967c94f2d4bb58 //log(1/frcpa(1+11/256))/2
data8 0x3f985188b630f068 //log(1/frcpa(1+12/256))/2
data8 0x3f9a6b8abe73af4c //log(1/frcpa(1+13/256))/2
data8 0x3f9c441e06f72a9e //log(1/frcpa(1+14/256))/2
data8 0x3f9e1e6713606d07 //log(1/frcpa(1+15/256))/2
data8 0x3f9ffa6911ab9301 //log(1/frcpa(1+16/256))/2
data8 0x3fa0ec139c5da601 //log(1/frcpa(1+17/256))/2
data8 0x3fa1dbd2643d190b //log(1/frcpa(1+18/256))/2
data8 0x3fa2cc7284fe5f1c //log(1/frcpa(1+19/256))/2
data8 0x3fa3bdf5a7d1ee64 //log(1/frcpa(1+20/256))/2
data8 0x3fa4b05d7aa012e0 //log(1/frcpa(1+21/256))/2
data8 0x3fa580db7ceb5702 //log(1/frcpa(1+22/256))/2
data8 0x3fa674f089365a7a //log(1/frcpa(1+23/256))/2
data8 0x3fa769ef2c6b568d //log(1/frcpa(1+24/256))/2
data8 0x3fa85fd927506a48 //log(1/frcpa(1+25/256))/2
data8 0x3fa9335e5d594989 //log(1/frcpa(1+26/256))/2
data8 0x3faa2b0220c8e5f5 //log(1/frcpa(1+27/256))/2
data8 0x3fab0004ac1a86ac //log(1/frcpa(1+28/256))/2
data8 0x3fabf968769fca11 //log(1/frcpa(1+29/256))/2
data8 0x3faccfedbfee13a8 //log(1/frcpa(1+30/256))/2
data8 0x3fada727638446a2 //log(1/frcpa(1+31/256))/2
data8 0x3faea3257fe10f7a //log(1/frcpa(1+32/256))/2
data8 0x3faf7be9fedbfde6 //log(1/frcpa(1+33/256))/2
data8 0x3fb02ab352ff25f4 //log(1/frcpa(1+34/256))/2
data8 0x3fb097ce579d204d //log(1/frcpa(1+35/256))/2
data8 0x3fb1178e8227e47c //log(1/frcpa(1+36/256))/2
data8 0x3fb185747dbecf34 //log(1/frcpa(1+37/256))/2
data8 0x3fb1f3b925f25d41 //log(1/frcpa(1+38/256))/2
data8 0x3fb2625d1e6ddf57 //log(1/frcpa(1+39/256))/2
data8 0x3fb2d1610c86813a //log(1/frcpa(1+40/256))/2
data8 0x3fb340c59741142e //log(1/frcpa(1+41/256))/2
data8 0x3fb3b08b6757f2a9 //log(1/frcpa(1+42/256))/2
data8 0x3fb40dfb08378003 //log(1/frcpa(1+43/256))/2
data8 0x3fb47e74e8ca5f7c //log(1/frcpa(1+44/256))/2
data8 0x3fb4ef51f6466de4 //log(1/frcpa(1+45/256))/2
data8 0x3fb56092e02ba516 //log(1/frcpa(1+46/256))/2
data8 0x3fb5d23857cd74d5 //log(1/frcpa(1+47/256))/2
data8 0x3fb6313a37335d76 //log(1/frcpa(1+48/256))/2
data8 0x3fb6a399dabbd383 //log(1/frcpa(1+49/256))/2
data8 0x3fb70337dd3ce41b //log(1/frcpa(1+50/256))/2
data8 0x3fb77654128f6127 //log(1/frcpa(1+51/256))/2
data8 0x3fb7e9d82a0b022d //log(1/frcpa(1+52/256))/2
data8 0x3fb84a6b759f512f //log(1/frcpa(1+53/256))/2
data8 0x3fb8ab47d5f5a310 //log(1/frcpa(1+54/256))/2
data8 0x3fb91fe49096581b //log(1/frcpa(1+55/256))/2
data8 0x3fb981634011aa75 //log(1/frcpa(1+56/256))/2
data8 0x3fb9f6c407089664 //log(1/frcpa(1+57/256))/2
data8 0x3fba58e729348f43 //log(1/frcpa(1+58/256))/2
data8 0x3fbabb55c31693ad //log(1/frcpa(1+59/256))/2
data8 0x3fbb1e104919efd0 //log(1/frcpa(1+60/256))/2
data8 0x3fbb94ee93e367cb //log(1/frcpa(1+61/256))/2
data8 0x3fbbf851c067555f //log(1/frcpa(1+62/256))/2
data8 0x3fbc5c0254bf23a6 //log(1/frcpa(1+63/256))/2
data8 0x3fbcc000c9db3c52 //log(1/frcpa(1+64/256))/2
data8 0x3fbd244d99c85674 //log(1/frcpa(1+65/256))/2
data8 0x3fbd88e93fb2f450 //log(1/frcpa(1+66/256))/2
data8 0x3fbdedd437eaef01 //log(1/frcpa(1+67/256))/2
data8 0x3fbe530effe71012 //log(1/frcpa(1+68/256))/2
data8 0x3fbeb89a1648b971 //log(1/frcpa(1+69/256))/2
data8 0x3fbf1e75fadf9bde //log(1/frcpa(1+70/256))/2
data8 0x3fbf84a32ead7c35 //log(1/frcpa(1+71/256))/2
data8 0x3fbfeb2233ea07cd //log(1/frcpa(1+72/256))/2
data8 0x3fc028f9c7035c1c //log(1/frcpa(1+73/256))/2
data8 0x3fc05c8be0d9635a //log(1/frcpa(1+74/256))/2
data8 0x3fc085eb8f8ae797 //log(1/frcpa(1+75/256))/2
data8 0x3fc0b9c8e32d1911 //log(1/frcpa(1+76/256))/2
data8 0x3fc0edd060b78081 //log(1/frcpa(1+77/256))/2
data8 0x3fc122024cf0063f //log(1/frcpa(1+78/256))/2
data8 0x3fc14be2927aecd4 //log(1/frcpa(1+79/256))/2
data8 0x3fc180618ef18adf //log(1/frcpa(1+80/256))/2
data8 0x3fc1b50bbe2fc63b //log(1/frcpa(1+81/256))/2
data8 0x3fc1df4cc7cf242d //log(1/frcpa(1+82/256))/2
data8 0x3fc214456d0eb8d4 //log(1/frcpa(1+83/256))/2
data8 0x3fc23ec5991eba49 //log(1/frcpa(1+84/256))/2
data8 0x3fc2740d9f870afb //log(1/frcpa(1+85/256))/2
data8 0x3fc29ecdabcdfa04 //log(1/frcpa(1+86/256))/2
data8 0x3fc2d46602adccee //log(1/frcpa(1+87/256))/2
data8 0x3fc2ff66b04ea9d4 //log(1/frcpa(1+88/256))/2
data8 0x3fc335504b355a37 //log(1/frcpa(1+89/256))/2
data8 0x3fc360925ec44f5d //log(1/frcpa(1+90/256))/2
data8 0x3fc38bf1c3337e75 //log(1/frcpa(1+91/256))/2
data8 0x3fc3c25277333184 //log(1/frcpa(1+92/256))/2
data8 0x3fc3edf463c1683e //log(1/frcpa(1+93/256))/2
data8 0x3fc419b423d5e8c7 //log(1/frcpa(1+94/256))/2
data8 0x3fc44591e0539f49 //log(1/frcpa(1+95/256))/2
data8 0x3fc47c9175b6f0ad //log(1/frcpa(1+96/256))/2
data8 0x3fc4a8b341552b09 //log(1/frcpa(1+97/256))/2
data8 0x3fc4d4f3908901a0 //log(1/frcpa(1+98/256))/2
data8 0x3fc501528da1f968 //log(1/frcpa(1+99/256))/2
data8 0x3fc52dd06347d4f6 //log(1/frcpa(1+100/256))/2
data8 0x3fc55a6d3c7b8a8a //log(1/frcpa(1+101/256))/2
data8 0x3fc5925d2b112a59 //log(1/frcpa(1+102/256))/2
data8 0x3fc5bf406b543db2 //log(1/frcpa(1+103/256))/2
data8 0x3fc5ec433d5c35ae //log(1/frcpa(1+104/256))/2
data8 0x3fc61965cdb02c1f //log(1/frcpa(1+105/256))/2
data8 0x3fc646a84935b2a2 //log(1/frcpa(1+106/256))/2
data8 0x3fc6740add31de94 //log(1/frcpa(1+107/256))/2
data8 0x3fc6a18db74a58c5 //log(1/frcpa(1+108/256))/2
data8 0x3fc6cf31058670ec //log(1/frcpa(1+109/256))/2
data8 0x3fc6f180e852f0ba //log(1/frcpa(1+110/256))/2
data8 0x3fc71f5d71b894f0 //log(1/frcpa(1+111/256))/2
data8 0x3fc74d5aefd66d5c //log(1/frcpa(1+112/256))/2
data8 0x3fc77b79922bd37e //log(1/frcpa(1+113/256))/2
data8 0x3fc7a9b9889f19e2 //log(1/frcpa(1+114/256))/2
data8 0x3fc7d81b037eb6a6 //log(1/frcpa(1+115/256))/2
data8 0x3fc8069e33827231 //log(1/frcpa(1+116/256))/2
data8 0x3fc82996d3ef8bcb //log(1/frcpa(1+117/256))/2
data8 0x3fc85855776dcbfb //log(1/frcpa(1+118/256))/2
data8 0x3fc8873658327ccf //log(1/frcpa(1+119/256))/2
data8 0x3fc8aa75973ab8cf //log(1/frcpa(1+120/256))/2
data8 0x3fc8d992dc8824e5 //log(1/frcpa(1+121/256))/2
data8 0x3fc908d2ea7d9512 //log(1/frcpa(1+122/256))/2
data8 0x3fc92c59e79c0e56 //log(1/frcpa(1+123/256))/2
data8 0x3fc95bd750ee3ed3 //log(1/frcpa(1+124/256))/2
data8 0x3fc98b7811a3ee5b //log(1/frcpa(1+125/256))/2
data8 0x3fc9af47f33d406c //log(1/frcpa(1+126/256))/2
data8 0x3fc9df270c1914a8 //log(1/frcpa(1+127/256))/2
data8 0x3fca0325ed14fda4 //log(1/frcpa(1+128/256))/2
data8 0x3fca33440224fa79 //log(1/frcpa(1+129/256))/2
data8 0x3fca57725e80c383 //log(1/frcpa(1+130/256))/2
data8 0x3fca87d0165dd199 //log(1/frcpa(1+131/256))/2
data8 0x3fcaac2e6c03f896 //log(1/frcpa(1+132/256))/2
data8 0x3fcadccc6fdf6a81 //log(1/frcpa(1+133/256))/2
data8 0x3fcb015b3eb1e790 //log(1/frcpa(1+134/256))/2
data8 0x3fcb323a3a635948 //log(1/frcpa(1+135/256))/2
data8 0x3fcb56fa04462909 //log(1/frcpa(1+136/256))/2
data8 0x3fcb881aa659bc93 //log(1/frcpa(1+137/256))/2
data8 0x3fcbad0bef3db165 //log(1/frcpa(1+138/256))/2
data8 0x3fcbd21297781c2f //log(1/frcpa(1+139/256))/2
data8 0x3fcc039236f08819 //log(1/frcpa(1+140/256))/2
data8 0x3fcc28cb1e4d32fd //log(1/frcpa(1+141/256))/2
data8 0x3fcc4e19b84723c2 //log(1/frcpa(1+142/256))/2
data8 0x3fcc7ff9c74554c9 //log(1/frcpa(1+143/256))/2
data8 0x3fcca57b64e9db05 //log(1/frcpa(1+144/256))/2
data8 0x3fcccb130a5cebb0 //log(1/frcpa(1+145/256))/2
data8 0x3fccf0c0d18f326f //log(1/frcpa(1+146/256))/2
data8 0x3fcd232075b5a201 //log(1/frcpa(1+147/256))/2
data8 0x3fcd490246defa6b //log(1/frcpa(1+148/256))/2
data8 0x3fcd6efa918d25cd //log(1/frcpa(1+149/256))/2
data8 0x3fcd9509707ae52f //log(1/frcpa(1+150/256))/2
data8 0x3fcdbb2efe92c554 //log(1/frcpa(1+151/256))/2
data8 0x3fcdee2f3445e4af //log(1/frcpa(1+152/256))/2
data8 0x3fce148a1a2726ce //log(1/frcpa(1+153/256))/2
data8 0x3fce3afc0a49ff40 //log(1/frcpa(1+154/256))/2
data8 0x3fce6185206d516e //log(1/frcpa(1+155/256))/2
data8 0x3fce882578823d52 //log(1/frcpa(1+156/256))/2
data8 0x3fceaedd2eac990c //log(1/frcpa(1+157/256))/2
data8 0x3fced5ac5f436be3 //log(1/frcpa(1+158/256))/2
data8 0x3fcefc9326d16ab9 //log(1/frcpa(1+159/256))/2
data8 0x3fcf2391a2157600 //log(1/frcpa(1+160/256))/2
data8 0x3fcf4aa7ee03192d //log(1/frcpa(1+161/256))/2
data8 0x3fcf71d627c30bb0 //log(1/frcpa(1+162/256))/2
data8 0x3fcf991c6cb3b379 //log(1/frcpa(1+163/256))/2
data8 0x3fcfc07ada69a910 //log(1/frcpa(1+164/256))/2
data8 0x3fcfe7f18eb03d3e //log(1/frcpa(1+165/256))/2
data8 0x3fd007c053c5002e //log(1/frcpa(1+166/256))/2
data8 0x3fd01b942198a5a1 //log(1/frcpa(1+167/256))/2
data8 0x3fd02f74400c64eb //log(1/frcpa(1+168/256))/2
data8 0x3fd04360be7603ad //log(1/frcpa(1+169/256))/2
data8 0x3fd05759ac47fe34 //log(1/frcpa(1+170/256))/2
data8 0x3fd06b5f1911cf52 //log(1/frcpa(1+171/256))/2
data8 0x3fd078bf0533c568 //log(1/frcpa(1+172/256))/2
data8 0x3fd08cd9687e7b0e //log(1/frcpa(1+173/256))/2
data8 0x3fd0a10074cf9019 //log(1/frcpa(1+174/256))/2
data8 0x3fd0b5343a234477 //log(1/frcpa(1+175/256))/2
data8 0x3fd0c974c89431ce //log(1/frcpa(1+176/256))/2
data8 0x3fd0ddc2305b9886 //log(1/frcpa(1+177/256))/2
data8 0x3fd0eb524bafc918 //log(1/frcpa(1+178/256))/2
data8 0x3fd0ffb54213a476 //log(1/frcpa(1+179/256))/2
data8 0x3fd114253da97d9f //log(1/frcpa(1+180/256))/2
data8 0x3fd128a24f1d9aff //log(1/frcpa(1+181/256))/2
data8 0x3fd1365252bf0865 //log(1/frcpa(1+182/256))/2
data8 0x3fd14ae558b4a92d //log(1/frcpa(1+183/256))/2
data8 0x3fd15f85a19c765b //log(1/frcpa(1+184/256))/2
data8 0x3fd16d4d38c119fa //log(1/frcpa(1+185/256))/2
data8 0x3fd18203c20dd133 //log(1/frcpa(1+186/256))/2
data8 0x3fd196c7bc4b1f3b //log(1/frcpa(1+187/256))/2
data8 0x3fd1a4a738b7a33c //log(1/frcpa(1+188/256))/2
data8 0x3fd1b981c0c9653d //log(1/frcpa(1+189/256))/2
data8 0x3fd1ce69e8bb106b //log(1/frcpa(1+190/256))/2
data8 0x3fd1dc619de06944 //log(1/frcpa(1+191/256))/2
data8 0x3fd1f160a2ad0da4 //log(1/frcpa(1+192/256))/2
data8 0x3fd2066d7740737e //log(1/frcpa(1+193/256))/2
data8 0x3fd2147dba47a394 //log(1/frcpa(1+194/256))/2
data8 0x3fd229a1bc5ebac3 //log(1/frcpa(1+195/256))/2
data8 0x3fd237c1841a502e //log(1/frcpa(1+196/256))/2
data8 0x3fd24cfce6f80d9a //log(1/frcpa(1+197/256))/2
data8 0x3fd25b2c55cd5762 //log(1/frcpa(1+198/256))/2
data8 0x3fd2707f4d5f7c41 //log(1/frcpa(1+199/256))/2
data8 0x3fd285e0842ca384 //log(1/frcpa(1+200/256))/2
data8 0x3fd294294708b773 //log(1/frcpa(1+201/256))/2
data8 0x3fd2a9a2670aff0c //log(1/frcpa(1+202/256))/2
data8 0x3fd2b7fb2c8d1cc1 //log(1/frcpa(1+203/256))/2
data8 0x3fd2c65a6395f5f5 //log(1/frcpa(1+204/256))/2
data8 0x3fd2dbf557b0df43 //log(1/frcpa(1+205/256))/2
data8 0x3fd2ea64c3f97655 //log(1/frcpa(1+206/256))/2
data8 0x3fd3001823684d73 //log(1/frcpa(1+207/256))/2
data8 0x3fd30e97e9a8b5cd //log(1/frcpa(1+208/256))/2
data8 0x3fd32463ebdd34ea //log(1/frcpa(1+209/256))/2
data8 0x3fd332f4314ad796 //log(1/frcpa(1+210/256))/2
data8 0x3fd348d90e7464d0 //log(1/frcpa(1+211/256))/2
data8 0x3fd35779f8c43d6e //log(1/frcpa(1+212/256))/2
data8 0x3fd36621961a6a99 //log(1/frcpa(1+213/256))/2
data8 0x3fd37c299f3c366a //log(1/frcpa(1+214/256))/2
data8 0x3fd38ae2171976e7 //log(1/frcpa(1+215/256))/2
data8 0x3fd399a157a603e7 //log(1/frcpa(1+216/256))/2
data8 0x3fd3afccfe77b9d1 //log(1/frcpa(1+217/256))/2
data8 0x3fd3be9d503533b5 //log(1/frcpa(1+218/256))/2
data8 0x3fd3cd7480b4a8a3 //log(1/frcpa(1+219/256))/2
data8 0x3fd3e3c43918f76c //log(1/frcpa(1+220/256))/2
data8 0x3fd3f2acb27ed6c7 //log(1/frcpa(1+221/256))/2
data8 0x3fd4019c2125ca93 //log(1/frcpa(1+222/256))/2
data8 0x3fd4181061389722 //log(1/frcpa(1+223/256))/2
data8 0x3fd42711518df545 //log(1/frcpa(1+224/256))/2
data8 0x3fd436194e12b6bf //log(1/frcpa(1+225/256))/2
data8 0x3fd445285d68ea69 //log(1/frcpa(1+226/256))/2
data8 0x3fd45bcc464c893a //log(1/frcpa(1+227/256))/2
data8 0x3fd46aed21f117fc //log(1/frcpa(1+228/256))/2
data8 0x3fd47a1527e8a2d3 //log(1/frcpa(1+229/256))/2
data8 0x3fd489445efffccc //log(1/frcpa(1+230/256))/2
data8 0x3fd4a018bcb69835 //log(1/frcpa(1+231/256))/2
data8 0x3fd4af5a0c9d65d7 //log(1/frcpa(1+232/256))/2
data8 0x3fd4bea2a5bdbe87 //log(1/frcpa(1+233/256))/2
data8 0x3fd4cdf28f10ac46 //log(1/frcpa(1+234/256))/2
data8 0x3fd4dd49cf994058 //log(1/frcpa(1+235/256))/2
data8 0x3fd4eca86e64a684 //log(1/frcpa(1+236/256))/2
data8 0x3fd503c43cd8eb68 //log(1/frcpa(1+237/256))/2
data8 0x3fd513356667fc57 //log(1/frcpa(1+238/256))/2
data8 0x3fd522ae0738a3d8 //log(1/frcpa(1+239/256))/2
data8 0x3fd5322e26867857 //log(1/frcpa(1+240/256))/2
data8 0x3fd541b5cb979809 //log(1/frcpa(1+241/256))/2
data8 0x3fd55144fdbcbd62 //log(1/frcpa(1+242/256))/2
data8 0x3fd560dbc45153c7 //log(1/frcpa(1+243/256))/2
data8 0x3fd5707a26bb8c66 //log(1/frcpa(1+244/256))/2
data8 0x3fd587f60ed5b900 //log(1/frcpa(1+245/256))/2
data8 0x3fd597a7977c8f31 //log(1/frcpa(1+246/256))/2
data8 0x3fd5a760d634bb8b //log(1/frcpa(1+247/256))/2
data8 0x3fd5b721d295f10f //log(1/frcpa(1+248/256))/2
data8 0x3fd5c6ea94431ef9 //log(1/frcpa(1+249/256))/2
data8 0x3fd5d6bb22ea86f6 //log(1/frcpa(1+250/256))/2
data8 0x3fd5e6938645d390 //log(1/frcpa(1+251/256))/2
data8 0x3fd5f673c61a2ed2 //log(1/frcpa(1+252/256))/2
data8 0x3fd6065bea385926 //log(1/frcpa(1+253/256))/2
data8 0x3fd6164bfa7cc06b //log(1/frcpa(1+254/256))/2
data8 0x3fd62643fecf9743 //log(1/frcpa(1+255/256))/2
LOCAL_OBJECT_END(atanhf_data2)
.section .text
// atanhf: single-precision inverse hyperbolic tangent.
//   In:  f8 = x
//   Out: f8 = result (rounded to single by the final .s operation)
//
// Paths, in the order the code resolves them:
//   x unnormal (p9)      -> ATANH_UNORM; rejoins ATANH_COMMON unless the
//                           normalized value is still unnormal, in which
//                           case x -/+ x^2 is returned
//   x NaN or +/-0 (p7)   -> x*1 + x, returned immediately
//   |x| >= 1 (p6)        -> atanhf_ge_one: builds the result and error tag
//                           and branches to __libm_error_region
//   |x| < 2^(-20) (p8)   -> x + x^3
//   otherwise            -> 0.5*log(1 + x) - 0.5*log(1 - x), where each
//                           half-log is N*ln(2)/2 + T/2 + 0.5*r*poly(r)
//
// The coefficient table atanhf_data stores P3, P2, P1 and the constant 1
// already multiplied by 0.5, so every polynomial value below is the
// half-log contribution directly.
GLOBAL_LIBM_ENTRY(atanhf)
{ .mfi
getf.exp rArgSExpb = f8
fclass.m p9,p0 = f8, 0x0b // is arg denormal ?
mov rExpbMask = 0x1ffff
}
{ .mfi
addl DataPtr = @ltoff(atanhf_data), gp
fnma.s1 fOneMx = f8, f1, f1 // 1 - x
mov rBias = 0xffff
}
;;
{ .mfi
nop.m 0
fclass.m p7,p0 = f8, 0xc7 // is arg NaN or +/-0 ?
mov rNearZeroBound = 0xffeb // 2^(-20)
}
{ .mfi
ld8 DataPtr = [DataPtr]
fma.s1 fOnePx = f8, f1, f1 // 1 + x
nop.i 0
}
;;
{ .mfb
nop.m 0
fnorm.s1 fNormX = f8 // Normalize x
(p9) br.cond.spnt ATANH_UNORM // Branch if x=unorm
}
;;
ATANH_COMMON:
// Return here if x=unorm and not denorm
// p7 (NaN or +/-0) must still hold the fclass result from above when this
// label is reached from ATANH_UNORM.
{ .mfi
ldfpd fP3, fP2 = [DataPtr], 16
fma.s1 fX2 = f8, f8, f0 // x^2
nop.i 0
}
{ .mfb
nop.m 0
(p7) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0: result is x*1 + x
(p7) br.ret.spnt b0
}
;;
{ .mfi
ldfpd fP1, fHalf = [DataPtr], 16
frcpa.s1 fRcpM, p9 = f1, fOneMx // rcpm = frcpa(1 - x)
nop.i 0
}
;;
{ .mfi
getf.exp rExpbm = fOneMx
frcpa.s1 fRcpP, p0 = f1, fOnePx // rcpp = frcpa(1 + x)
// biased exponent
and rArgExpb = rArgSExpb, rExpbMask
}
;;
{ .mmi
getf.exp rExpbp = fOnePx
// is |x| < 2^(-20) ?
cmp.gt p8,p0 = rNearZeroBound, rArgExpb
cmp.ge p6,p0 = rArgExpb, rBias // is |x| >= 1 ?
}
;;
{ .mmb
getf.sig rSigm = fOneMx
nop.m 0
(p6) br.cond.spnt atanhf_ge_one
}
;;
{ .mfb
getf.sig rSigp = fOnePx
(p8) fma.s.s0 f8 = fX2, f8, f8 // x + x^3
(p8) br.ret.spnt b0 // Exit for MAX_DENORM_ABS < |x| < 2^-20
}
;;
// This third post-increment leaves DataPtr 48 bytes past atanhf_data,
// which is the start of the log table atanhf_data2.
{ .mfi
ldfd fLog2 = [DataPtr], 16
fms.s1 fRm = fRcpM, fOneMx, f1 // rm = rcpm * (1 - x) - 1
nop.i 0
}
;;
{ .mmf
// (1 - x) is always positive here and we need not mask sign bit
sub rNm = rExpbm, rBias
// (1 + x) is always positive here and we need not mask sign bit
sub rNp = rExpbp, rBias
fms.s1 fRp = fRcpP, fOnePx, f1 // rp = rcpp * (1 + x) - 1
}
;;
{ .mmi
setf.sig fN4CvtM = rNm
setf.sig fN4CvtP = rNp
extr.u rIndm = rSigm,55,8 // Extract 8 bits
}
;;
// Table entries are 8 bytes each: address = DataPtr + index*8.
{ .mmi
shladd RcpTablePtrM = rIndm, 3, DataPtr
nop.m 0
extr.u rIndp = rSigp,55,8 // Extract 8 bits
}
;;
{ .mmi
ldfd fLogTm = [RcpTablePtrM]
shladd RcpTablePtrP = rIndp, 3, DataPtr
nop.i 0
}
;;
{ .mfi
ldfd fLogTp = [RcpTablePtrP]
fma.s1 fRm2 = fRm, fRm, f0 // rm^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fP32m = fP3, fRm, fP2 // 0.5*(P3*rm + P2)
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRp2 = fRp, fRp, f0 // rp^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fP10m = fP1, fRm, fHalf // 0.5*(P1*rm + 1)
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP32p = fP3, fRp, fP2 // 0.5*(P3*rp + P2)
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fP10p = fP1, fRp, fHalf // 0.5*(P1*rp + 1)
nop.i 0
}
;;
{ .mfi
nop.m 0
fcvt.xf fNm = fN4CvtM
nop.i 0
}
{ .mfi
nop.m 0
fcvt.xf fNp = fN4CvtP
nop.i 0
}
;;
{ .mfi
nop.m 0
// 0.5*((P3*rm + P2)*rm^2 + (P1*rm + 1))
fma.s1 fP32m = fP32m, fRm2, fP10m
nop.i 0
}
{ .mfi
nop.m 0
// 0.5*((P3*rp + P2)*rp^2 + (P1*rp + 1))
fma.s1 fP32p = fP32p, fRp2, fP10p
nop.i 0
}
;;
{ .mfi
nop.m 0
// Nm*ln(2)/2 + Tm/2
fma.s1 fLogTm = fNm, fLog2, fLogTm
nop.i 0
}
{ .mfi
nop.m 0
// Np*ln(2)/2 + Tp/2
fma.s1 fLogTp = fNp, fLog2, fLogTp
nop.i 0
}
;;
{ .mfi
nop.m 0
// ((P3*rm + P2)*rm^2 + (P1*rm + 1))*0.5*rm + (Nm*ln(2)/2 + Tm/2)
fma.d.s1 fP32m = fP32m, fRm, fLogTm
nop.i 0
}
{ .mfi
nop.m 0
// ((P3*rp + P2)*rp^2 + (P1*rp + 1))*0.5*rp + (Np*ln(2)/2 + Tp/2)
fma.d.s1 fP32p = fP32p, fRp, fLogTp
nop.i 0
}
;;
{ .mfb
nop.m 0
// atanhf(x) = 0.5 * (log(1 + x) - log(1 - x))
fnma.s.s0 f8 = fP32m, f1, fP32p
br.ret.sptk b0 // Exit for 2^(-20) <= |x| < 1.0
}
;;
ATANH_UNORM:
// Here if x=unorm
{ .mfi
getf.exp rArgSExpb = fNormX // Recompute if x unorm
fclass.m p0,p9 = fNormX, 0x0b // Test x denorm
nop.i 0
}
;;
// The compare raises the denormal flag and also yields the sign of x:
// p10 = (x < 0), p11 = the complement.
{ .mfb
nop.m 0
fcmp.lt.s0 p10,p11 = f8, f0 // Set denormal flag
(p9) br.cond.sptk ATANH_COMMON // Continue if x unorm and not denorm
}
;;
// Select the result with the predicates written by the fcmp above.
// p6/p7 must not be used here: p6 has not been written on this path, and
// p7 holds the NaN/zero class test that ATANH_COMMON still reads when the
// branch above is taken, so it cannot be reused as a sign predicate.
.pred.rel "mutex",p10,p11
{ .mfi
nop.m 0
(p10) fnma.s.s0 f8 = f8,f8,f8 // Result x-x^2 if x=-denorm
nop.i 0
}
{ .mfb
nop.m 0
(p11) fma.s.s0 f8 = f8,f8,f8 // Result x+x^2 if x=+denorm
br.ret.spnt b0 // Exit if denorm
}
;;
// Here if |x| >= 1.0
// Sets f10 = original x, f8 = result, atanh_GR_tag = error tag, then
// branches (not calls) to __libm_error_region, so b0 still holds the
// return address of atanhf's caller.
atanhf_ge_one:
{ .mfi
alloc r32 = ar.pfs,1,3,4,0
fmerge.s fArgAbs = f0, f8 // Form |x|
nop.i 0
}
;;
{ .mfi
nop.m 0
fmerge.s f10 = f8, f8 // Save input for error call
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.eq.s1 p6,p7 = fArgAbs, f1 // Test for |x| = 1.0
nop.i 0
}
;;
// Set error tag and result, and raise invalid flag if |x| > 1.0
{ .mfi
(p7) mov atanh_GR_tag = 133
(p7) frcpa.s0 f8, p0 = f0, f0 // Get QNaN, and raise invalid
nop.i 0
}
;;
// Set error tag and result, and raise Z flag if |x| = 1.0
{ .mfi
nop.m 0
(p6) frcpa.s0 fRm, p0 = f1, f0 // Get inf, and raise Z flag
nop.i 0
}
;;
{ .mfb
(p6) mov atanh_GR_tag = 134
(p6) fmerge.s f8 = f8, fRm // result is +-inf
br.cond.sptk __libm_error_region // Exit if |x| >= 1.0
}
;;
GLOBAL_LIBM_END(atanhf)
libm_alias_float_other (atanh, atanh)
// __libm_error_region: local stub reached by a plain branch from
// atanhf_ge_one.  It spills three single-precision values to a new 64-byte
// stack frame, calls __libm_error_support, reloads f8 from the result slot
// and returns through the saved b0.
//
// Expected on entry (set up by atanhf_ge_one):
//   f10 = original argument, f8 = tentative result,
//   atanh_GR_tag = error tag
//
// Frame layout relative to sp after "add sp=-64,sp":
//   sp+16  parameter 1 (f10)
//   sp+32  parameter 2 (f1)
//   sp+48  parameter 3 / result (f8), reloaded after the call
//
// GR_Parameter_X, GR_Parameter_Y and GR_Parameter_RESULT hold the
// addresses of those three slots at the time of the call.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
// GR_Parameter_Y was computed from the old sp, so it addresses new sp+32;
// the post-increment then moves it to new sp+48.
{ .mmi
stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
// Parameter 3 address
add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0
}
// The store below uses GR_Parameter_Y as the result slot address (sp+48);
// the add in the same bundle then steps it back to the parameter 2 slot.
{ .mib
stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
// Recompute the result address from sp after the call instead of reusing
// the value that was in GR_Parameter_RESULT before it.
{ .mmi
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// External error handler called above.
.type __libm_error_support#,@function
.global __libm_error_support#

File diff suppressed because it is too large Load Diff

View File

@ -1,866 +0,0 @@
.file "cosh.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 02/02/00 Initial version
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 05/07/01 Reworked to improve speed of all paths
// 05/20/02 Cleaned up namespace and sf0 syntax
// 11/15/02 Improved speed with new algorithm
// 03/31/05 Reformatted delimiters between data tables
// API
//==============================================================
// double cosh(double)
// Overview of operation
//==============================================================
// Case 1: 0 < |x| < 0.25
// Evaluate cosh(x) by a 12th order polynomial
// Care is taken for the order of multiplication; and A2 is not exactly 1/4!,
// A3 is not exactly 1/6!, etc.
// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8 + A5*x^10 + A6*x^12)
//
// Case 2: 0.25 < |x| < 710.47586
// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2.
// The algorithm for exp is described as below. There are a number of
// economies from evaluating both exp(x) and exp(-x). Although we
// are evaluating both quantities, only where the quantities diverge do we
// duplicate the computations. The basic algorithm for exp(x) is described
// below.
//
// Take the input x. w is "how many log2/128 in x?"
// w = x * 128/log2
// n = int(w)
// x = n log2/128 + r + delta
// n = 128M + index_1 + 2^4 index_2
// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
// Construct 2^M
// Get 2^(index_1/128) from table_1;
// Get 2^(index_2/8) from table_2;
// Calculate exp(r) by 5th order polynomial
// r = x - n (log2/128)_high
// delta = - n (log2/128)_low
// Calculate exp(delta) as 1 + delta
// Special values
//==============================================================
// cosh(+0) = 1.0
// cosh(-0) = 1.0
// cosh(+qnan) = +qnan
// cosh(-qnan) = -qnan
// cosh(+snan) = +qnan
// cosh(-snan) = -qnan
// cosh(-inf) = +inf
// cosh(+inf) = +inf
// Overflow and Underflow
//=======================
// cosh(x) = largest double normal when
// x = 710.47586 = 0x408633ce8fb9f87d
//
// There is no underflow.
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input, output
// f6 -> f15, f32 -> f61
// General registers used:
// r14 -> r40
// Predicate registers used:
// p6 -> p15
// Assembly macros
//==============================================================
// Symbolic register names for cosh.  Several physical registers are given
// more than one name below (for example r14, r21-r26, r28, r31, f42-f44,
// f49-f55, f58, f59); writing through one name overwrites the value known
// by the other names of the same register.
//
// General registers
rRshf = r14
rN_neg = r14
rAD_TB1 = r15
rAD_TB2 = r16
rAD_P = r17
rN = r18
rIndex_1 = r19
rIndex_2_16 = r20
rM = r21
rBiased_M = r21
rSig_inv_ln2 = r22
rIndex_1_neg = r22
rExp_bias = r23
rExp_bias_minus_1 = r23
rExp_mask = r24
rTmp = r24
rGt_ln = r24
rIndex_2_16_neg = r24
rM_neg = r25
rBiased_M_neg = r25
rRshf_2to56 = r26
rAD_T1_neg = r26
rExp_2tom56 = r28
rAD_T2_neg = r28
rAD_T1 = r29
rAD_T2 = r30
rSignexp_x = r31
rExp_x = r31
// Save and parameter registers; presumably used around the call to the
// libm error handler, which is not within this part of the file.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point registers
FR_X = f10
FR_Y = f1
FR_RESULT = f8
fRSHF_2TO56 = f6
fINV_LN2_2TO63 = f7
fW_2TO56_RSH = f9
f2TOM56 = f11
fP5 = f12
fP4 = f13
fP3 = f14
fP2 = f15
fLn2_by_128_hi = f33
fLn2_by_128_lo = f34
fRSHF = f35
fNfloat = f36
fNormX = f37
fR = f38
fF = f39
fRsq = f40
f2M = f41
fS1 = f42
fT1 = f42
fS2 = f43
fT2 = f43
fS = f43
fWre_urm_f8 = f44
fAbsX = f44
fMIN_DBL_OFLOW_ARG = f45
fMAX_DBL_NORM_ARG = f46
fXsq = f47
fX4 = f48
fGt_pln = f49
fTmp = f49
fP54 = f50
fP5432 = f50
fP32 = f51
fP = f52
fP54_neg = f53
fP5432_neg = f53
fP32_neg = f54
fP_neg = f55
fF_neg = f56
f2M_neg = f57
fS1_neg = f58
fT1_neg = f58
fS2_neg = f59
fT2_neg = f59
fS_neg = f59
fExp = f60
fExp_neg = f61
// A1..A6 coefficient names; they reuse f50-f55, the same registers as the
// fP* temporaries above (presumably for the small-|x| polynomial path).
fA6 = f50
fA65 = f50
fA6543 = f50
fA654321 = f50
fA5 = f51
fA4 = f52
fA43 = f52
fA3 = f53
fA2 = f54
fA21 = f54
fA1 = f55
// Data tables
//==============================================================
RODATA
.align 16
// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
// cosh walks exp_table_1 with post-incrementing loads (ldfpd +16, ldfe +16,
// ldfe +16) and then reaches the later tables by fixed offsets from the
// resulting pointer: +0x100 for exp_table_2 and +0x180 for exp_p_table.
// Changing the order or size of any of these objects breaks those offsets.
//
// double-extended 1/ln(2)
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
// 3fff b8aa 3b29 5c17 f0bc
// For speed the significand will be loaded directly with a movl and setf.sig
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
// computations need to scale appropriately.
// The constant 128/ln(2) is needed for the computation of w. This is also
// obtained by scaling the computations.
//
// Two shifting constants are loaded directly with movl and setf.d.
// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63+63-7)
// This constant is added to x*1/ln2 to shift the integer part of
// x*128/ln2 into the rightmost bits of the significand.
// The result of this fma is fW_2TO56_RSH.
// 2. fRSHF = 1.1000..00 * 2^(63)
// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
// the integer part of w, n, as a floating-point number.
// The result of this fms is fNfloat.
LOCAL_OBJECT_START(exp_table_1)
// Two doubles, loaded together as a pair
data8 0x408633ce8fb9f87e // smallest dbl overflow arg
data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result
// Two 16-byte (significand, exponent) entries, each loaded with ldfe
data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
//
// Table 1 is 2^(index_1/128) where
// index_1 goes from 0 to 15
// (16 entries of 16 bytes = 0x100 bytes; addressed as base + index_1*16)
//
data8 0x8000000000000000 , 0x00003FFF
data8 0x80B1ED4FD999AB6C , 0x00003FFF
data8 0x8164D1F3BC030773 , 0x00003FFF
data8 0x8218AF4373FC25EC , 0x00003FFF
data8 0x82CD8698AC2BA1D7 , 0x00003FFF
data8 0x8383594EEFB6EE37 , 0x00003FFF
data8 0x843A28C3ACDE4046 , 0x00003FFF
data8 0x84F1F656379C1A29 , 0x00003FFF
data8 0x85AAC367CC487B15 , 0x00003FFF
data8 0x8664915B923FBA04 , 0x00003FFF
data8 0x871F61969E8D1010 , 0x00003FFF
data8 0x87DB357FF698D792 , 0x00003FFF
data8 0x88980E8092DA8527 , 0x00003FFF
data8 0x8955EE03618E5FDD , 0x00003FFF
data8 0x8A14D575496EFD9A , 0x00003FFF
data8 0x8AD4C6452C728924 , 0x00003FFF
LOCAL_OBJECT_END(exp_table_1)
// Table 2 is 2^(index_2/8) where
// index_2 goes from 0 to 7
// Each entry is a 16-byte (significand, exponent) pair loaded with ldfe.
// cosh addresses this table as (start of Table 1) + 0x100 + index_2*16,
// so it must directly follow exp_table_1.
LOCAL_OBJECT_START(exp_table_2)
data8 0x8000000000000000 , 0x00003FFF
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
data8 0x9837F0518DB8A96F , 0x00003FFF
data8 0xA5FED6A9B15138EA , 0x00003FFF
data8 0xB504F333F9DE6484 , 0x00003FFF
data8 0xC5672A115506DADD , 0x00003FFF
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
data8 0xEAC0C6E7DD24392F , 0x00003FFF
LOCAL_OBJECT_END(exp_table_2)
// Polynomial coefficients P5..P2 for the exp(r) approximation, stored as
// doubles.  cosh loads them as two pairs (P5,P4) and (P3,P2) starting at
// (start of Table 1) + 0x180, so this object must directly follow
// exp_table_2 and keep this entry order.
LOCAL_OBJECT_START(exp_p_table)
data8 0x3f8111116da21757 //P5
data8 0x3fa55555d787761c //P4
data8 0x3fc5555555555414 //P3
data8 0x3fdffffffffffd6a //P2
LOCAL_OBJECT_END(exp_p_table)
// Coefficients A1..A6 of the small-|x| polynomial
// cosh(x) = 1 + (A1*x^2 + A2*x^4 + ... + A6*x^12), stored as 16-byte
// (significand, exponent) pairs.  Note the storage order: the even-numbered
// coefficients A6, A4, A2 come first, then A5, A3, A1.
// NOTE(review): the code that loads this table is outside this part of the
// file; presumably the order matches its load sequence - confirm before
// reordering.
LOCAL_OBJECT_START(cosh_p_table)
data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // A6
data8 0xD00D00D1021D7370, 0x00003FEF // A4
data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // A2
data8 0x93F27740C0C2F1CC, 0x00003FE9 // A5
data8 0xB60B60B60B4FE884, 0x00003FF5 // A3
data8 0x8000000000000000, 0x00003FFE // A1
LOCAL_OBJECT_END(cosh_p_table)
.section .text
GLOBAL_IEEE754_ENTRY(cosh)
{ .mlx
getf.exp rSignexp_x = f8 // Must recompute if x unorm
movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
{ .mlx
addl rAD_TB1 = @ltoff(exp_table_1), gp
movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
}
;;
{ .mfi
ld8 rAD_TB1 = [rAD_TB1]
fclass.m p6,p0 = f8,0x0b // Test for x=unorm
mov rExp_mask = 0x1ffff
}
{ .mfi
mov rExp_bias = 0xffff
fnorm.s1 fNormX = f8
mov rExp_2tom56 = 0xffff-56
}
;;
// Form two constants we need
// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
{ .mfi
setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
fclass.m p8,p0 = f8,0x07 // Test for x=0
nop.i 999
}
{ .mlx
setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
}
;;
{ .mfi
ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16
fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT
nop.i 0
}
{ .mfb
setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
nop.f 0
(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm
}
;;
COSH_COMMON:
{ .mfi
ldfe fLn2_by_128_hi = [rAD_TB1],16
nop.f 0
nop.i 0
}
{ .mfb
setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0
(p8) br.ret.spnt b0
}
;;
{ .mfi
ldfe fLn2_by_128_lo = [rAD_TB1],16
nop.f 0
nop.i 0
}
{ .mfb
and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=inf, nan, NaT
(p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT
}
;;
// After that last load rAD_TB1 points to the beginning of table 1
{ .mfi
nop.m 0
fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
sub rExp_x = rExp_x, rExp_bias // True exponent of x
}
;;
{ .mfi
nop.m 0
fmerge.s fAbsX = f0, fNormX // Form |x|
nop.i 0
}
{ .mfb
cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2
}
;;
// W = X * Inv_log2_by_128
// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
{ .mfi
add rAD_P = 0x180, rAD_TB1
fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
add rAD_TB2 = 0x100, rAD_TB1
}
;;
// Divide arguments into the following categories:
// Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG
// Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG
// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf
//
// If the input is really a double arg, then there will never be
// "Possible Overflow" arguments.
//
{ .mfi
ldfpd fP5, fP4 = [rAD_P] ,16
fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG
nop.i 0
}
;;
// Nfloat = round_int(W)
// The signficand of fW_2TO56_RSH contains the rounded integer part of W,
// as a twos complement number in the lower bits (that is, it may be negative).
// That twos complement number (called N) is put into rN.
// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
// Thus, fNfloat contains the floating point version of N
{ .mfi
ldfpd fP3, fP2 = [rAD_P]
(p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG
nop.i 0
}
{ .mfb
nop.m 0
fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
(p15) br.cond.spnt COSH_CERTAIN_OVERFLOW
}
;;
{ .mfi
getf.sig rN = fW_2TO56_RSH
nop.f 0
mov rExp_bias_minus_1 = 0xfffe
}
;;
// rIndex_1 has index_1
// rIndex_2_16 has index_2 * 16
// rBiased_M has M
// rM has true M
// r = x - Nfloat * ln2_by_128_hi
// f = 1 - Nfloat * ln2_by_128_lo
{ .mfi
and rIndex_1 = 0x0f, rN
fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
shr rM = rN, 0x7
}
{ .mfi
and rIndex_2_16 = 0x70, rN
fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
sub rN_neg = r0, rN
}
;;
{ .mmi
and rIndex_1_neg = 0x0f, rN_neg
add rBiased_M = rExp_bias_minus_1, rM
shr rM_neg = rN_neg, 0x7
}
{ .mmi
and rIndex_2_16_neg = 0x70, rN_neg
add rAD_T2 = rAD_TB2, rIndex_2_16
shladd rAD_T1 = rIndex_1, 4, rAD_TB1
}
;;
// rAD_T1 has address of T1
// rAD_T2 has address of T2
{ .mmi
setf.exp f2M = rBiased_M
ldfe fT2 = [rAD_T2]
nop.i 0
}
{ .mmi
add rBiased_M_neg = rExp_bias_minus_1, rM_neg
add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg
shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1
}
;;
// Create Scale = 2^M
// Load T1 and T2
{ .mmi
ldfe fT1 = [rAD_T1]
nop.m 0
nop.i 0
}
{ .mmf
setf.exp f2M_neg = rBiased_M_neg
ldfe fT2_neg = [rAD_T2_neg]
fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1
}
;;
{ .mfi
nop.m 0
fma.s1 fRsq = fR, fR, f0
nop.i 0
}
{ .mfi
ldfe fT1_neg = [rAD_T1_neg]
fma.s1 fP54 = fR, fP5, fP4
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP32 = fR, fP3, fP2
nop.i 0
}
{ .mfi
nop.m 0
fnma.s1 fP54_neg = fR, fP5, fP4
nop.i 0
}
;;
{ .mfi
nop.m 0
fnma.s1 fP32_neg = fR, fP3, fP2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP5432 = fRsq, fP54, fP32
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS2 = fF,fT2,f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fS1 = f2M,fT1,f0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fS1_neg = f2M_neg,fT1_neg,f0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS2_neg = fF_neg,fT2_neg,f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP = fRsq, fP5432, fR
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS = fS1,fS2,f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fms.s1 fP_neg = fRsq, fP5432_neg, fR
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS_neg = fS1_neg,fS2_neg,f0
nop.i 0
}
;;
{ .mfb
nop.m 0
fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
(p14) br.cond.spnt COSH_POSSIBLE_OVERFLOW
}
;;
{ .mfi
nop.m 0
fma.s1 fExp = fS, fP, fS
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg
nop.i 0
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fExp, f1, fExp_neg
br.ret.sptk b0 // Normal path exit
}
;;
// Here if 0 < |x| < 0.25
COSH_SMALL:
{ .mmf
add rAD_T1 = 0x1a0, rAD_TB1
add rAD_T2 = 0x1d0, rAD_TB1
}
;;
{ .mmf
ldfe fA6 = [rAD_T1],16
ldfe fA5 = [rAD_T2],16
nop.f 0
}
;;
{ .mmi
ldfe fA4 = [rAD_T1],16
ldfe fA3 = [rAD_T2],16
nop.i 0
}
;;
{ .mmi
ldfe fA2 = [rAD_T1],16
ldfe fA1 = [rAD_T2],16
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fX4 = fXsq, fXsq, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA65 = fXsq, fA6, fA5
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA43 = fXsq, fA4, fA3
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA21 = fXsq, fA2, fA1
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA6543 = fX4, fA65, fA43
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA654321 = fX4, fA6543, fA21
nop.i 0
}
;;
// Dummy multiply to generate inexact
{ .mfi
nop.m 0
fmpy.s0 fTmp = fA6, fA6
nop.i 0
}
{ .mfb
nop.m 0
fma.d.s0 f8 = fA654321, fXsq, f1
br.ret.sptk b0 // Exit if 0 < |x| < 0.25
}
;;
COSH_POSSIBLE_OVERFLOW:
// Here if fMAX_DBL_NORM_ARG < |x| < fMIN_DBL_OFLOW_ARG
// This cannot happen if input is a double, only if input higher precision.
// Overflow is a possibility, not a certainty.
// Recompute result using status field 2 with user's rounding mode,
// and wre set. If result is larger than largest double, then we have
// overflow
{ .mfi
mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
nop.i 0
}
;;
{ .mfi
setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
nop.i 0
}
;;
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
nop.i 0
}
;;
{ .mfb
nop.m 0
nop.f 0
(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fS, fP, fS
br.ret.sptk b0 // Exit if really no overflow
}
;;
COSH_CERTAIN_OVERFLOW:
{ .mmi
sub rTmp = rExp_mask, r0, 1
;;
setf.exp fTmp = rTmp
nop.i 0
}
;;
{ .mfi
alloc r32=ar.pfs,1,4,4,0
fmerge.s FR_X = f8,f8
nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 64
fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
br.cond.sptk __libm_error_region
}
;;
// Here if x unorm
COSH_UNORM:
{ .mfb
getf.exp rSignexp_x = fNormX // Must recompute if x unorm
fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
br.cond.sptk COSH_COMMON
}
;;
GLOBAL_IEEE754_END(cosh)
libm_alias_double_other (__cosh, cosh)
// __libm_error_region
//
// Local trampoline that hands an error case to __libm_error_support.
// It is entered by a branch (not a call) from the overflow path, which has
// already placed the original argument in FR_X, the provisional result in
// FR_RESULT and the error tag in GR_Parameter_TAG.
//
// The routine builds a 64-byte frame and spills the three floating-point
// values as doubles so that their addresses can be passed on:
//   sp+16 : FR_X       (address left in GR_Parameter_X)
//   sp+32 : FR_Y       (address left in GR_Parameter_Y)
//   sp+48 : FR_RESULT  (address left in GR_Parameter_RESULT)
// After the call the (possibly modified) result is reloaded from sp+48 into
// f8, the frame is released and control returns to the original caller
// through the saved b0.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// No stop separates this bundle from the next one, so this add still reads
// the caller's sp: GR_Parameter_Y = old sp - 32 = new sp + 32.
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Store at new sp+32; the post-increment leaves GR_Parameter_Y = new sp+48.
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
// GR_Parameter_Y currently holds new sp+48, the slot used for the result.
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
// Step GR_Parameter_Y back to new sp+32, the address of the FR_Y slot.
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
// GR_Parameter_RESULT is recomputed after the call rather than trusted to
// have survived it.
{ .mmi
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,711 +0,0 @@
.file "coshf.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//*********************************************************************
// 02/02/00 Initial version
// 02/16/00 The error tag for coshf overflow changed to 65 (from 64).
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 05/07/01 Reworked to improve speed of all paths
// 05/20/02 Cleaned up namespace and sf0 syntax
// 11/15/02 Improved algorithm based on expf
// 03/31/05 Reformatted delimiters between data tables
//
// API
//*********************************************************************
// float coshf(float)
//
// Overview of operation
//*********************************************************************
// Case 1: 0 < |x| < 0.25
// Evaluate cosh(x) by an 8th order polynomial
// Care is taken for the order of multiplication; and A2 is not exactly 1/4!,
// A3 is not exactly 1/6!, etc.
// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8)
//
// Case 2: 0.25 <= |x| < 89.41598
// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2.
// The algorithm for exp is described as below. There are a number of
// economies from evaluating both exp(x) and exp(-x). Although we
// are evaluating both quantities, only where the quantities diverge do we
// duplicate the computations. The basic algorithm for exp(x) is described
// below.
//
// Take the input x. w is "how many log2/64 in x?"
// w = x * 64/log2
// NJ = int(w)
// x = NJ*log2/64 + R
// NJ = 64*n + j
// x = n*log2 + (log2/64)*j + R
//
// So, exp(x) = 2^n * 2^(j/64)* exp(R)
//
// T = 2^n * 2^(j/64)
// Construct 2^n
// Get 2^(j/64) table
// actually all the entries of 2^(j/64) table are stored in DP and
// with exponent bits set to 0 -> multiplication on 2^n can be
// performed by doing logical "or" operation with bits presenting 2^n
// exp(R) = 1 + (exp(R) - 1)
// P = exp(R) - 1 approximated by Taylor series of 3rd degree
// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
//
// The final result is reconstructed as follows
// exp(x) = T + T*P
// Special values
//*********************************************************************
// coshf(+0) = 1.0
// coshf(-0) = 1.0
// coshf(+qnan) = +qnan
// coshf(-qnan) = -qnan
// coshf(+snan) = +qnan
// coshf(-snan) = -qnan
// coshf(-inf) = +inf
// coshf(+inf) = +inf
// Overflow and Underflow
//*********************************************************************
// coshf(x) = largest single normal when
// x = 89.41598 = 0x42b2d4fc
//
// There is no underflow.
// Registers used
//*********************************************************************
// Floating Point registers used:
// f8 input, output
// f6,f7, f9 -> f15, f32 -> f45
// General registers used:
// r2, r3, r16 -> r38
// Predicate registers used:
// p6 -> p15
// Assembly macros
//*********************************************************************
// integer registers used
// scratch
// Symbolic names for the registers used by coshf.
//
// Several names deliberately share one physical register. Each pair below
// is safe only because the first value is dead before the second is
// written; keep that in mind when reordering code.
//   r18 : rSignexp_x -> rExp_x   (masked in place to the biased exponent)
//   r29 : rGt_ln / r17ones_m1    (possible-overflow vs certain-overflow path)
//   r30 : rRightShifter -> rJ_mask (shifter is copied to an FP register first)
//   r31 : r64DivLn2 -> rN_mask     (64/ln2 is copied to an FP register first)
rNJ = r2
rNJ_neg = r3
rJ_neg = r16
rN_neg = r17
rSignexp_x = r18
rExp_x = r18
rExp_mask = r19
rExp_bias = r20
rAd1 = r21
rAd2 = r22
rJ = r23
rN = r24
rTblAddr = r25
rA3 = r26
rExpHalf = r27
rLn2Div64 = r28
rGt_ln = r29
r17ones_m1 = r29
rRightShifter = r30
rJ_mask = r30
r64DivLn2 = r31
rN_mask = r31
// stacked
// r32-r38 only exist after the alloc in COSH_CERTAIN_OVERFLOW
// (0 inputs, 3 locals, 4 outputs): r32-r34 are the locals used for the
// saved state and r35-r38 are the outgoing arguments of the error call.
GR_SAVE_PFS = r32
GR_SAVE_B0 = r33
GR_SAVE_GP = r34
GR_Parameter_X = r35
GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Parameter_TAG = r38
// floating point registers used
// FR_X shares f10 with fNint; it is written only on the overflow path.
// FR_Y is f1, which the code uses as the constant 1.0 (coshf has no second
// argument).
FR_X = f10
FR_Y = f1
FR_RESULT = f8
// scratch
fRightShifter = f6
f64DivLn2 = f7
fNormX = f9
fNint = f10
fN = f11
fR = f12
fLn2Div64 = f13
fA2 = f14
fA3 = f15
// stacked
// Shared registers among the names below:
//   f37 : fA1 -> fA21
//   f38 : fA4 -> fA43 -> fA4321
//   f39 : fX4 / fTmp / fGt_pln
//   f40 : fWre_urm_f8 / fXsq
fP = f32
fT = f33
fMIN_SGL_OFLOW_ARG = f34
fMAX_SGL_NORM_ARG = f35
fRSqr = f36
fA1 = f37
fA21 = f37
fA4 = f38
fA43 = f38
fA4321 = f38
fX4 = f39
fTmp = f39
fGt_pln = f39
fWre_urm_f8 = f40
fXsq = f40
fP_neg = f41
fT_neg = f42
fExp = f43
fExp_neg = f44
fAbsX = f45
RODATA
.align 16
// _coshf_table layout (byte offsets from the start of the object):
//   0x000  single: smallest argument whose result overflows single
//   0x004  single: largest argument whose result is a normal single
//   0x008  8 bytes of padding
//   0x010  64 entries of 8 bytes: 2^(j/64) for j = 0..63
// Each 2^(j/64) entry holds only the fraction bits of the double value (the
// exponent field is zero), so the code can OR in the exponent bits of 2^(n-1)
// instead of multiplying.
// The code addresses cosh_p_table as 0x200 bytes past the first 2^(j/64)
// entry, i.e. it assumes cosh_p_table is placed immediately after this
// object. DO NOT reorder, resize or separate the two tables.
LOCAL_OBJECT_START(_coshf_table)
data4 0x42b2d4fd // Smallest single arg to overflow single result
data4 0x42b2d4fc // Largest single arg to give normal single result
data4 0x00000000 // pad
data4 0x00000000 // pad
//
// 2^(j/64) table, j goes from 0 to 63
data8 0x0000000000000000 // 2^(0/64)
data8 0x00002C9A3E778061 // 2^(1/64)
data8 0x000059B0D3158574 // 2^(2/64)
data8 0x0000874518759BC8 // 2^(3/64)
data8 0x0000B5586CF9890F // 2^(4/64)
data8 0x0000E3EC32D3D1A2 // 2^(5/64)
data8 0x00011301D0125B51 // 2^(6/64)
data8 0x0001429AAEA92DE0 // 2^(7/64)
data8 0x000172B83C7D517B // 2^(8/64)
data8 0x0001A35BEB6FCB75 // 2^(9/64)
data8 0x0001D4873168B9AA // 2^(10/64)
data8 0x0002063B88628CD6 // 2^(11/64)
data8 0x0002387A6E756238 // 2^(12/64)
data8 0x00026B4565E27CDD // 2^(13/64)
data8 0x00029E9DF51FDEE1 // 2^(14/64)
data8 0x0002D285A6E4030B // 2^(15/64)
data8 0x000306FE0A31B715 // 2^(16/64)
data8 0x00033C08B26416FF // 2^(17/64)
data8 0x000371A7373AA9CB // 2^(18/64)
data8 0x0003A7DB34E59FF7 // 2^(19/64)
data8 0x0003DEA64C123422 // 2^(20/64)
data8 0x0004160A21F72E2A // 2^(21/64)
data8 0x00044E086061892D // 2^(22/64)
data8 0x000486A2B5C13CD0 // 2^(23/64)
data8 0x0004BFDAD5362A27 // 2^(24/64)
data8 0x0004F9B2769D2CA7 // 2^(25/64)
data8 0x0005342B569D4F82 // 2^(26/64)
data8 0x00056F4736B527DA // 2^(27/64)
data8 0x0005AB07DD485429 // 2^(28/64)
data8 0x0005E76F15AD2148 // 2^(29/64)
data8 0x0006247EB03A5585 // 2^(30/64)
data8 0x0006623882552225 // 2^(31/64)
data8 0x0006A09E667F3BCD // 2^(32/64)
data8 0x0006DFB23C651A2F // 2^(33/64)
data8 0x00071F75E8EC5F74 // 2^(34/64)
data8 0x00075FEB564267C9 // 2^(35/64)
data8 0x0007A11473EB0187 // 2^(36/64)
data8 0x0007E2F336CF4E62 // 2^(37/64)
data8 0x00082589994CCE13 // 2^(38/64)
data8 0x000868D99B4492ED // 2^(39/64)
data8 0x0008ACE5422AA0DB // 2^(40/64)
data8 0x0008F1AE99157736 // 2^(41/64)
data8 0x00093737B0CDC5E5 // 2^(42/64)
data8 0x00097D829FDE4E50 // 2^(43/64)
data8 0x0009C49182A3F090 // 2^(44/64)
data8 0x000A0C667B5DE565 // 2^(45/64)
data8 0x000A5503B23E255D // 2^(46/64)
data8 0x000A9E6B5579FDBF // 2^(47/64)
data8 0x000AE89F995AD3AD // 2^(48/64)
data8 0x000B33A2B84F15FB // 2^(49/64)
data8 0x000B7F76F2FB5E47 // 2^(50/64)
data8 0x000BCC1E904BC1D2 // 2^(51/64)
data8 0x000C199BDD85529C // 2^(52/64)
data8 0x000C67F12E57D14B // 2^(53/64)
data8 0x000CB720DCEF9069 // 2^(54/64)
data8 0x000D072D4A07897C // 2^(55/64)
data8 0x000D5818DCFBA487 // 2^(56/64)
data8 0x000DA9E603DB3285 // 2^(57/64)
data8 0x000DFC97337B9B5F // 2^(58/64)
data8 0x000E502EE78B3FF6 // 2^(59/64)
data8 0x000EA4AFA2A490DA // 2^(60/64)
data8 0x000EFA1BEE615A27 // 2^(61/64)
data8 0x000F50765B6E4540 // 2^(62/64)
data8 0x000FA7C1819E90D8 // 2^(63/64)
LOCAL_OBJECT_END(_coshf_table)
// Coefficients (double precision) of the small-argument polynomial
//   cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8)
// They are close to, but not exactly, 1/2!, 1/4!, 1/6! and 1/8!.
// COSH_SMALL loads them as two pairs (A4,A3) and (A2,A1), and reaches this
// table by a fixed offset from the 2^(j/64) table, so it must stay directly
// after _coshf_table and keep this order.
LOCAL_OBJECT_START(cosh_p_table)
data8 0x3efa3001dcf5905b // A4
data8 0x3f56c1437543543e // A3
data8 0x3fa5555572601504 // A2
data8 0x3fdfffffffe2f097 // A1
LOCAL_OBJECT_END(cosh_p_table)
.section .text
// float coshf(float)
//   In:  f8 = x
//   Out: f8 = cosh(x)
// Entry sequence: materialise the constants in general registers, fetch the
// table address through the linkage table, normalise x and classify it.
// Predicates produced here and consumed later:
//   p6  - x is unnormal (handled at COSH_UNORM, which rejoins COSH_COMMON)
//   p15 - x is NaT, NaN or infinity
//   p13 - x == 0.0
GLOBAL_IEEE754_ENTRY(coshf)
{ .mlx
getf.exp rSignexp_x = f8 // Must recompute if x unorm
movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
}
{ .mlx
addl rTblAddr = @ltoff(_coshf_table),gp
// 0x43E8000000000000 is the double 1.5 * 2^63. Adding it pushes the integer
// part of the sum into the low bits of the significand.
movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
}
;;
{ .mfi
// point to the beginning of the table
ld8 rTblAddr = [rTblAddr]
fclass.m p6, p0 = f8, 0x0b // Test for x=unorm
addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
}
{ .mfi
nop.m 0
fnorm.s1 fNormX = f8 // normalized x
addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
{ .mfi
setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
fclass.m p15, p0 = f8, 0x1e3 // test for NaT,NaN,Inf
nop.i 0
}
{ .mlx
// load Right Shifter to FP reg
setf.d fRightShifter = rRightShifter
movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
}
;;
{ .mfi
mov rExp_mask = 0x1ffff
fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
}
{ .mfb
nop.m 0
nop.f 0
(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm
}
;;
// COSH_COMMON: main path, also re-entered from COSH_UNORM.
//
// 1. Return early for NaT/NaN/Inf (p15) and for x == 0 (p13).
// 2. Branch to COSH_SMALL when |x| < 2^-2 (p7).
// 3. Otherwise reduce x = N*ln(2)/64 + R with N = 64*n + j, and build
//      T     = 2^(n-1) * 2^(j/64)      (for  x)
//      T_neg = the same quantity for -x
//    directly as double-precision bit patterns in general registers.
// 4. Evaluate P ~= exp(R) - 1 and P_neg ~= exp(-R) - 1, then
//      exp(x)/2 = T + T*P,  exp(-x)/2 = T_neg + T_neg*P_neg
//    and return their sum.
// Overflow handling: |x| >= fMIN_SGL_OFLOW_ARG branches to
// COSH_CERTAIN_OVERFLOW; |x| > fMAX_SGL_NORM_ARG branches to
// COSH_POSSIBLE_OVERFLOW after exp(x)/2 and exp(-x)/2 have been formed.
COSH_COMMON:
{ .mfi
setf.exp fA2 = rExpHalf // load A2 to FP reg
nop.f 0
mov rExp_bias = 0xffff
}
{ .mfb
setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
(p15) fma.s.s0 f8 = f8, f8, f0 // result if x = NaT,NaN,Inf
(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,Inf
}
;;
{ .mfi
// min overflow and max normal threshold
// The post-increment of 8 here plus the later "add rTblAddr = 8" leave
// rTblAddr pointing at the first 2^(j/64) entry.
ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8
nop.f 0
and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
}
{ .mfb
setf.s fA3 = rA3 // load A3 to FP reg
(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0
(p13) br.ret.spnt b0 // exit here if x =0.0
}
;;
{ .mfi
sub rExp_x = rExp_x, rExp_bias // True exponent of x
fmerge.s fAbsX = f0, fNormX // Form |x|
nop.i 0
}
;;
{ .mfi
nop.m 0
// x*(64/ln(2)) + Right Shifter
fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
add rTblAddr = 8, rTblAddr
}
{ .mfb
cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2
}
;;
{ .mfi
nop.m 0
// check for overflow
// p13 is only a throw-away complement here; it is redefined below as the
// "possible overflow" predicate.
fcmp.ge.s1 p12, p13 = fAbsX, fMIN_SGL_OFLOW_ARG
mov rJ_mask = 0x3f // 6-bit mask for J
}
;;
{ .mfb
nop.m 0
fms.s1 fN = fNint, f1, fRightShifter // n in FP register
// branch out if overflow
(p12) br.cond.spnt COSH_CERTAIN_OVERFLOW
}
;;
{ .mfi
getf.sig rNJ = fNint // bits of n, j
// check for possible overflow
fcmp.gt.s1 p13, p0 = fAbsX, fMAX_SGL_NORM_ARG
nop.i 0
}
;;
// 0xFFBF - 63 = 0xFF80 = (0x3FF - 1) << 6: adding it to N = 64*n + j biases
// the n field by the double-precision bias minus one, giving 2^(n-1) once
// the field is moved into the exponent position.
{ .mfi
addl rN = 0xFFBF - 63, rNJ // biased and shifted n-1,j
fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
and rJ = rJ_mask, rNJ // bits of j
}
{ .mfi
sub rNJ_neg = r0, rNJ // bits of n, j for -x
nop.f 0
andcm rN_mask = -1, rJ_mask // 0xff...fc0 to mask N
}
;;
{ .mfi
shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
nop.f 0
and rN = rN_mask, rN // biased, shifted n-1
}
{ .mfi
addl rN_neg = 0xFFBF - 63, rNJ_neg // -x biased, shifted n-1,j
nop.f 0
and rJ_neg = rJ_mask, rNJ_neg // bits of j for -x
}
;;
{ .mfi
ld8 rJ = [rJ] // Table value
nop.f 0
// The n field sits at bit 6; shifting by 46 moves it to bit 52, the
// exponent field of a double.
shl rN = rN, 46 // 2^(n-1) bits in DP format
}
{ .mfi
shladd rJ_neg = rJ_neg, 3, rTblAddr // addr in 2^(j/64) table -x
nop.f 0
and rN_neg = rN_mask, rN_neg // biased, shifted n-1 for -x
}
;;
{ .mfi
ld8 rJ_neg = [rJ_neg] // Table value for -x
nop.f 0
shl rN_neg = rN_neg, 46 // 2^(n-1) bits in DP format for -x
}
;;
// Table entries carry only fraction bits, so OR-ing them with the exponent
// bits yields the complete double 2^(n-1) * 2^(j/64).
{ .mfi
or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format
nop.f 0
nop.i 0
}
;;
{ .mmf
setf.d fT = rN // 2^(n-1) * 2^(j/64)
or rN_neg = rN_neg, rJ_neg // -x bits of 2^n * 2^(j/64) in DP
fma.s1 fRSqr = fR, fR, f0 // R^2
}
;;
{ .mfi
setf.d fT_neg = rN_neg // 2^(n-1) * 2^(j/64) for -x
fma.s1 fP = fA3, fR, fA2 // A3*R + A2
nop.i 0
}
{ .mfi
nop.m 0
// For -x the reduced argument is -R, hence fnma: -(A3*R) + A2.
fnma.s1 fP_neg = fA3, fR, fA2 // A3*R + A2 for -x
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
nop.i 0
}
{ .mfi
nop.m 0
// fms subtracts R, i.e. adds the reduced argument -R of the -x branch.
fms.s1 fP_neg = fP_neg, fRSqr, fR // P = (A3*R + A2)*R^2 + R, -x
nop.i 0
}
;;
{ .mfi
nop.m 0
fmpy.s0 fTmp = fLn2Div64, fLn2Div64 // Force inexact
nop.i 0
}
;;
// Both halves are formed in the same instruction group as the branch below,
// so they are available on the COSH_POSSIBLE_OVERFLOW path as well.
{ .mfi
nop.m 0
fma.s1 fExp = fP, fT, fT // exp(x)/2
nop.i 0
}
{ .mfb
nop.m 0
fma.s1 fExp_neg = fP_neg, fT_neg, fT_neg // exp(-x)/2
// branch out if possible overflow result
(p13) br.cond.spnt COSH_POSSIBLE_OVERFLOW
}
;;
{ .mfb
nop.m 0
// final result in the absence of overflow
fma.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)+exp(-x))/2
// exit here in the absence of overflow
br.ret.sptk b0 // Exit main path, 0.25 <= |x| < 89.41598
}
;;
// Here if 0 < |x| < 0.25. Evaluate 8th order polynomial.
//   cosh(x) = 1 + x^2 * (A1 + A2*x^2 + A3*x^4 + A4*x^6)
// On entry fXsq = x*x and rTblAddr points at the first 2^(j/64) entry.
// That table is 64 * 8 = 0x200 bytes long, so rTblAddr + 0x200 is the start
// of cosh_p_table (this relies on cosh_p_table directly following
// _coshf_table).
COSH_SMALL:
{ .mmi
add rAd1 = 0x200, rTblAddr
add rAd2 = 0x210, rTblAddr
nop.i 0
}
;;
{ .mmi
ldfpd fA4, fA3 = [rAd1]
ldfpd fA2, fA1 = [rAd2]
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fX4 = fXsq, fXsq, f0
nop.i 0
}
;;
// The two halves of the polynomial are independent and are evaluated in the
// same instruction group. fA43 and fA21 overwrite fA4 and fA1 (shared
// registers), which are read for the last time by these two instructions.
{ .mfi
nop.m 0
fma.s1 fA43 = fXsq, fA4, fA3
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA21 = fXsq, fA2, fA1
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA4321 = fX4, fA43, fA21
nop.i 0
}
;;
// Dummy multiply to generate inexact
// NOTE(review): fA4 shares its register with fA4321 and fTmp shares its
// register with fX4, so this multiplies the polynomial value rather than the
// A4 coefficient. The product is used only for its flag side effect.
{ .mfi
nop.m 0
fmpy.s0 fTmp = fA4, fA4
nop.i 0
}
{ .mfb
nop.m 0
fma.s.s0 f8 = fA4321, fXsq, f1
br.ret.sptk b0 // Exit if 0 < |x| < 0.25
}
;;
COSH_POSSIBLE_OVERFLOW:
// Here if fMAX_SGL_NORM_ARG < |x| < fMIN_SGL_OFLOW_ARG
// This cannot happen if the input is a true single; it is reachable only
// when f8 carries a value with more precision than single.
// Overflow is a possibility, not a certainty.
// Recompute the result using status field 2 with the user's rounding mode
// and wre set. If the result is larger than the largest single, then we
// have overflow.
//
// On entry fExp = exp(x)/2 and fExp_neg = exp(-x)/2 are already available:
// both are computed in the instruction group that ends with the branch to
// this label. The result is formed from their sum, exactly as on the main
// path. The range test that selects this path is made on |x|, so x may be
// negative, in which case exp(-x)/2 is the dominant term; recomputing only
// exp(x)/2 (fP*fT + fT) would then return a value near zero and the
// overflow test below could never fire.
{ .mfi
mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
nop.i 0
}
;;
{ .mfi
setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
fma.s.s2 fWre_urm_f8 = fExp, f1, fExp_neg // Result with wre set
nop.i 0
}
;;
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
nop.i 0
}
;;
{ .mfb
nop.m 0
nop.f 0
(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow
}
;;
{ .mfb
nop.m 0
fma.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)+exp(-x))/2
br.ret.sptk b0 // Exit if really no overflow
}
;;
// here if overflow
// Builds the overflowed result and transfers to __libm_error_region with
//   FR_X             = original argument (copy of f8)
//   FR_RESULT        = fTmp*fTmp rounded to single in sf0
//   GR_Parameter_TAG = 65, the tag this file uses for coshf overflow
// fTmp is given the exponent 0x1FFFE, so squaring it overflows single
// precision and raises the overflow and inexact flags.
COSH_CERTAIN_OVERFLOW:
{ .mmi
addl r17ones_m1 = 0x1FFFE, r0
;;
setf.exp fTmp = r17ones_m1
nop.i 0
}
;;
{ .mfi
// 0 inputs, 3 locals (r32-r34), 4 outputs (r35-r38): makes the stacked
// registers named GR_SAVE_* and GR_Parameter_* available.
alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
fmerge.s FR_X = f8,f8
nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 65
fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
br.cond.sptk __libm_error_region
}
;;
// Here if x unorm
// The sign/exponent read at entry came from the unnormalised f8, so it is
// re-read from the normalised copy fNormX before rejoining the main path.
// The compare in status field 0 is executed only for its side effect of
// setting the denormal flag; its p6 result is not used afterwards.
COSH_UNORM:
{ .mfb
getf.exp rSignexp_x = fNormX // Must recompute if x unorm
fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
br.cond.sptk COSH_COMMON // Return to main path
}
;;
GLOBAL_IEEE754_END(coshf)
libm_alias_float_other (__cosh, cosh)
// __libm_error_region
//
// Local trampoline that hands the coshf overflow case to
// __libm_error_support. It is entered by a branch (not a call) from
// COSH_CERTAIN_OVERFLOW, which has already set FR_X (the argument),
// FR_RESULT (the provisional result) and GR_Parameter_TAG.
//
// A 64-byte frame is created and the three values are spilled as singles:
//   sp+16 : FR_X       (address left in GR_Parameter_X)
//   sp+32 : FR_Y       (address left in GR_Parameter_Y)
//   sp+48 : FR_RESULT  (address left in GR_Parameter_RESULT)
// GR_Parameter_X/Y/RESULT/TAG are the four output registers set up by the
// alloc in COSH_CERTAIN_OVERFLOW, so they form the argument list of the call.
// After the call the result is reloaded from sp+48 into f8, the frame is
// released and control returns to coshf's caller through the saved b0.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// No stop separates this bundle from the next one, so this add still reads
// the caller's sp: GR_Parameter_Y = old sp - 32 = new sp + 32.
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Store at new sp+32; the post-increment leaves GR_Parameter_Y = new sp+48.
stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mfi
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
nop.f 0
// GR_Parameter_Y currently holds new sp+48, the slot used for the result.
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
// Step GR_Parameter_Y back to new sp+32, the address of the FR_Y slot.
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
// GR_Parameter_RESULT is recomputed after the call rather than trusted to
// have survived it.
{ .mmi
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#

File diff suppressed because it is too large Load Diff

View File

@ -1,799 +0,0 @@
.file "exp.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 2/02/00 Initial version
// 3/07/00 exp(inf) = inf but now does NOT call error support
// exp(-inf) = 0 but now does NOT call error support
// 4/04/00 Unwind support added
// 8/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 11/30/00 Reworked to shorten main path, widen main path to include all
// args in normal range, and add quick exit for 0, nan, inf.
// 12/05/00 Loaded constants earlier with setf to save 2 cycles.
// 02/05/02 Corrected uninitialized predicate in POSSIBLE_UNDERFLOW path
// 05/20/02 Cleaned up namespace and sf0 syntax
// 09/07/02 Force inexact flag
// 11/15/02 Split underflow path into zero/nonzero; eliminated fma in main path
// 05/30/03 Set inexact flag on unmasked overflow/underflow
// 03/31/05 Reformatted delimiters between data tables
// API
//==============================================================
// double exp(double)
// Overview of operation
//==============================================================
// Take the input x. w is "how many log2/128 in x?"
// w = x * 128/log2
// n = int(w)
// x = n log2/128 + r + delta
// n = 128M + index_1 + 2^4 index_2
// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
// Construct 2^M
// Get 2^(index_1/128) from table_1;
// Get 2^(index_2/8) from table_2;
// Calculate exp(r) by 5th order polynomial
// r = x - n (log2/128)_high
// delta = - n (log2/128)_low
// Calculate exp(delta) as 1 + delta
// Special values
//==============================================================
// exp(+0) = 1.0
// exp(-0) = 1.0
// exp(+qnan) = +qnan
// exp(-qnan) = -qnan
// exp(+snan) = +qnan
// exp(-snan) = -qnan
// exp(-inf) = +0
// exp(+inf) = +inf
// Overflow and Underflow
//=======================
// exp(x) = largest double normal when
// x = 709.7827 = 0x40862e42fefa39ef
// exp(x) = smallest double normal when
// x = -708.396 = 0xc086232bdd7abcd2
// exp(x) = largest round-to-nearest double zero when
// x = -745.1332 = 0xc0874910d52d3052
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input, output
// f6 -> f15, f32 -> f49
// General registers used:
// r14 -> r40
// Predicate registers used:
// p6 -> p15
// Assembly macros
//==============================================================
// Symbolic names for the registers used by exp.
//
// Several names share one physical register; the later name reuses the
// register once the earlier value is no longer needed:
//   r15 : rAD_TB1 / rAD_T1
//   r16 : rAD_TB2 / rAD_T2
//   r21 : rM / rBiased_M / rIndex_1_16
// General registers
rRshf = r14
rAD_TB1 = r15
rAD_T1 = r15
rAD_TB2 = r16
rAD_T2 = r16
rAD_P = r17
rN = r18
rIndex_1 = r19
rIndex_2_16 = r20
rM = r21
rBiased_M = r21
rIndex_1_16 = r21
rSig_inv_ln2 = r22
rExp_bias = r23
rExp_mask = r24
rTmp = r25
rRshf_2to56 = r26
rGt_ln = r27
rExp_2tom56 = r28
// Stacked registers used around the error-support call: saved state in
// r33-r36, call parameters in r37-r40.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point values handed to the error-support call.
FR_X = f10
FR_Y = f1
FR_RESULT = f8
// Floating-point working registers. Shared registers:
//   f12 : fP5 / fP54 / fP5432
//   f14 : fP3 / fP32
//   f15 : fP2 / fP
//   f42 : fS1 / fT1
//   f43 : fS2 / fT2 / fS
//   f44 : fWre_urm_f8 / fFtz_urm_f8
//   f49 : fGt_pln / fTmp
fRSHF_2TO56 = f6
fINV_LN2_2TO63 = f7
fW_2TO56_RSH = f9
f2TOM56 = f11
fP5 = f12
fP54 = f12
fP5432 = f12
fP4 = f13
fP3 = f14
fP32 = f14
fP2 = f15
fP = f15
fLn2_by_128_hi = f33
fLn2_by_128_lo = f34
fRSHF = f35
fNfloat = f36
fNormX = f37
fR = f38
fF = f39
fRsq = f40
f2M = f41
fS1 = f42
fT1 = f42
fS2 = f43
fT2 = f43
fS = f43
fWre_urm_f8 = f44
fFtz_urm_f8 = f44
fMIN_DBL_OFLOW_ARG = f45
fMAX_DBL_ZERO_ARG = f46
fMAX_DBL_NORM_ARG = f47
fMIN_DBL_NORM_ARG = f48
fGt_pln = f49
fTmp = f49
// Data tables
//==============================================================
RODATA
.align 16
// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
// double-extended 1/ln(2)
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
// 3fff b8aa 3b29 5c17 f0bc
// For speed the significand will be loaded directly with a movl and setf.sig
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
// computations need to scale appropriately.
// The constant 128/ln(2) is needed for the computation of w. This is also
// obtained by scaling the computations.
//
// Two shifting constants are loaded directly with movl and setf.d.
// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7)
// This constant is added to x*1/ln2 to shift the integer part of
// x*128/ln2 into the rightmost bits of the significand.
// The result of this fma is fW_2TO56_RSH.
// 2. fRSHF = 1.1000..00 * 2^(63)
// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
// the integer part of w, n, as a floating-point number.
// The result of this fms is fNfloat.
// exp_table_1 layout (byte offsets from the start of the object):
//   0x00  four doubles: the range thresholds, read as two pairs
//   0x20  ln2/128 high part, 16-byte extended entry
//   0x30  ln2/128 low part, 16-byte extended entry
//   0x40  16 entries of 16 bytes: 2^(index_1/128), index_1 = 0..15
// Every extended entry is a 64-bit significand followed by a word holding
// the sign and exponent.
// The code walks the header with four post-incrementing loads of 16 bytes
// each, which leaves rAD_TB1 at offset 0x40, and then reaches the next two
// tables by adding fixed offsets, so the sizes and order must not change.
LOCAL_OBJECT_START(exp_table_1)
data8 0x40862e42fefa39f0 // smallest dbl overflow arg, +709.7827
data8 0xc0874910d52d3052 // largest arg for rnd-to-nearest 0 result, -745.133
data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result, +709.7827
data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result, -708.396
data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
//
// Table 1 is 2^(index_1/128) where
// index_1 goes from 0 to 15
//
data8 0x8000000000000000 , 0x00003FFF
data8 0x80B1ED4FD999AB6C , 0x00003FFF
data8 0x8164D1F3BC030773 , 0x00003FFF
data8 0x8218AF4373FC25EC , 0x00003FFF
data8 0x82CD8698AC2BA1D7 , 0x00003FFF
data8 0x8383594EEFB6EE37 , 0x00003FFF
data8 0x843A28C3ACDE4046 , 0x00003FFF
data8 0x84F1F656379C1A29 , 0x00003FFF
data8 0x85AAC367CC487B15 , 0x00003FFF
data8 0x8664915B923FBA04 , 0x00003FFF
data8 0x871F61969E8D1010 , 0x00003FFF
data8 0x87DB357FF698D792 , 0x00003FFF
data8 0x88980E8092DA8527 , 0x00003FFF
data8 0x8955EE03618E5FDD , 0x00003FFF
data8 0x8A14D575496EFD9A , 0x00003FFF
data8 0x8AD4C6452C728924 , 0x00003FFF
LOCAL_OBJECT_END(exp_table_1)
// Table 2 is 2^(index_2/8) where
// index_2 goes from 0 to 7
// Eight 16-byte extended entries holding 2^(index_2/8), index_2 = 0..7.
// The code forms this table's address as rAD_TB1 + 0x100, i.e. directly
// after the sixteen 16-byte entries of table 1, so it must immediately
// follow exp_table_1.
LOCAL_OBJECT_START(exp_table_2)
data8 0x8000000000000000 , 0x00003FFF
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
data8 0x9837F0518DB8A96F , 0x00003FFF
data8 0xA5FED6A9B15138EA , 0x00003FFF
data8 0xB504F333F9DE6484 , 0x00003FFF
data8 0xC5672A115506DADD , 0x00003FFF
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
data8 0xEAC0C6E7DD24392F , 0x00003FFF
LOCAL_OBJECT_END(exp_table_2)
// Double-precision coefficients P5..P2 of the polynomial used for exp(r).
// They are loaded as two pairs, (P5,P4) then (P3,P2), from the address
// rAD_TB2 + 0x80, i.e. directly after the eight 16-byte entries of
// exp_table_2, so this table must immediately follow exp_table_2 and keep
// this order.
LOCAL_OBJECT_START(exp_p_table)
data8 0x3f8111116da21757 //P5
data8 0x3fa55555d787761c //P4
data8 0x3fc5555555555414 //P3
data8 0x3fdffffffffffd6a //P2
LOCAL_OBJECT_END(exp_p_table)
.section .text
// exp(x), double precision: entry point and main (in-range) path.
// Input x arrives in f8 and the result is returned in f8.
//
// Outline of the code below:
//   1. Quick exits: x=-inf returns f0*f0+f0, x=0 returns f1*f1+f0,
//      x=+inf/NaN/NaT returns f8*f8+f0.
//   2. N = round_int(x * 128/ln2), obtained with the right-shifter constant.
//      N is split into index_1 (bits 0-3), index_2*16 (bits 4-6, left in
//      place) and M = N >> 7.
//   3. r = x - N*ln2_by_128_hi,  f = 1 - N*ln2_by_128_lo,
//      P = r + r^2*((P2 + P3*r) + r^2*(P4 + P5*r)).
//   4. S = (2^M * T1[index_1]) * (f * T2[index_2]);  result = S + S*P.
//   5. Certain overflow/underflow and the two "possible" ranges branch to the
//      labelled handlers that follow this path.
GLOBAL_IEEE754_ENTRY(exp)
{ .mlx
nop.m 0
movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
{ .mlx
addl rAD_TB1 = @ltoff(exp_table_1), gp
movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
}
;;
{ .mfi
ld8 rAD_TB1 = [rAD_TB1]
fclass.m p8,p0 = f8,0x07 // Test for x=0
mov rExp_mask = 0x1ffff
}
{ .mfi
mov rExp_bias = 0xffff
fnorm.s1 fNormX = f8
mov rExp_2tom56 = 0xffff-56
}
;;
// Form two constants we need
// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
{ .mfi
setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
fclass.m p9,p0 = f8,0x22 // Test for x=-inf
nop.i 0
}
{ .mlx
setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
}
;;
{ .mfi
ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_ZERO_ARG = [rAD_TB1],16
fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, nan, NaT
nop.i 0
}
{ .mfb
setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
(p9) fma.d.s0 f8 = f0,f0,f0 // quick exit for x=-inf
(p9) br.ret.spnt b0
}
;;
{ .mfi
ldfpd fMAX_DBL_NORM_ARG, fMIN_DBL_NORM_ARG = [rAD_TB1],16
nop.f 0
nop.i 0
}
{ .mfb
setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0
(p8) br.ret.spnt b0
}
;;
{ .mfb
ldfe fLn2_by_128_hi = [rAD_TB1],16
(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=+inf, nan, NaT
(p10) br.ret.spnt b0 // quick exit for x=+inf, nan, NaT
}
;;
{ .mfi
ldfe fLn2_by_128_lo = [rAD_TB1],16
fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
nop.i 0
}
;;
// After that last load, rAD_TB1 points to the beginning of table 1
// W = X * Inv_log2_by_128
// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
// We actually add 1.10...0*2^(63+56) to X * (1/ln2 * 2^63) to do the same
// thing, which leaves the result scaled by 2^56.
{ .mfi
nop.m 0
fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
nop.i 0
}
;;
// Divide arguments into the following categories:
// Certain Underflow p11 - -inf < x <= MAX_DBL_ZERO_ARG
// Possible Underflow p13 - MAX_DBL_ZERO_ARG < x < MIN_DBL_NORM_ARG
// Certain Safe - MIN_DBL_NORM_ARG <= x <= MAX_DBL_NORM_ARG
// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG
// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf
//
// If the input is really a double arg, then there will never be
// "Possible Overflow" arguments.
//
{ .mfi
add rAD_TB2 = 0x100, rAD_TB1 // table 2 starts 0x100 bytes after table 1
fcmp.ge.s1 p15,p0 = fNormX,fMIN_DBL_OFLOW_ARG
nop.i 0
}
;;
{ .mfi
add rAD_P = 0x80, rAD_TB2 // polynomial coefficients: 0x80 bytes after table 2
fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_ZERO_ARG
nop.i 0
}
;;
{ .mfb
ldfpd fP5, fP4 = [rAD_P] ,16
fcmp.gt.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG
(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW
}
;;
// Nfloat = round_int(W)
// The significand of fW_2TO56_RSH contains the rounded integer part of W,
// as a twos complement number in the lower bits (that is, it may be negative).
// That twos complement number (called N) is put into rN.
// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
// Thus, fNfloat contains the floating point version of N
{ .mfb
ldfpd fP3, fP2 = [rAD_P]
fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW
}
;;
{ .mfi
getf.sig rN = fW_2TO56_RSH
nop.f 0
nop.i 0
}
;;
// rIndex_1 has index_1
// rIndex_2_16 has index_2 * 16
// rBiased_M has M
// rIndex_1_16 has index_1 * 16
// rM has true M
// r = x - Nfloat * ln2_by_128_hi
// f = 1 - Nfloat * ln2_by_128_lo
{ .mfi
and rIndex_1 = 0x0f, rN
fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
shr rM = rN, 0x7
}
{ .mfi
and rIndex_2_16 = 0x70, rN
fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
nop.i 0
}
;;
// rAD_T1 has address of T1
// rAD_T2 has address of T2
{ .mmi
add rBiased_M = rExp_bias, rM
add rAD_T2 = rAD_TB2, rIndex_2_16
shladd rAD_T1 = rIndex_1, 4, rAD_TB1 // 16-byte entries: TB1 + index_1*16
}
;;
// Create Scale = 2^M
{ .mmi
setf.exp f2M = rBiased_M
ldfe fT2 = [rAD_T2]
nop.i 0
}
;;
// Load T1 and T2
{ .mfi
ldfe fT1 = [rAD_T1]
fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRsq = fR, fR, f0 // r^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fP54 = fR, fP5, fP4 // P4 + P5*r
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.lt.s1 p13,p0 = fNormX,fMIN_DBL_NORM_ARG // possible underflow?
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fP32 = fR, fP3, fP2 // P2 + P3*r
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP5432 = fRsq, fP54, fP32 // (P2 + P3*r) + r^2*(P4 + P5*r)
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fS1 = f2M,fT1,f0 // 2^M * T1
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS2 = fF,fT2,f0 // f * T2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP = fRsq, fP5432, fR // P = r + r^2*P5432
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fS = fS1,fS2,f0 // S = S1 * S2
nop.i 0
}
;;
// fS and fP are complete here; the two "possible" handlers reuse them.
{ .mbb
nop.m 0
(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW
(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fS, fP, fS // result = S + S*P, rounded to double
br.ret.sptk b0 // Normal path exit
}
;;
EXP_POSSIBLE_OVERFLOW:
// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG
// This cannot happen if the input is a double, only if the input has higher
// precision.
// Overflow is a possibility, not a certainty.
// Recompute result using status field 2 with user's rounding mode,
// and wre set. If result is larger than largest double, then we have
// overflow
// Uses fS and fP as computed by the main path; f8 has not been written yet.
{ .mfi
mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
nop.i 0
}
;;
{ .mfi
setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
nop.i 0
}
;;
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
nop.i 0
}
;;
{ .mfb
nop.m 0
nop.f 0
(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fS, fP, fS // Same S + S*P as the normal path, in sf0
br.ret.sptk b0 // Exit if really no overflow
}
;;
EXP_CERTAIN_OVERFLOW:
// Reached from the main path (x >= fMIN_DBL_OFLOW_ARG) or from
// EXP_POSSIBLE_OVERFLOW.  On both routes f8 still holds the caller's x.
// Builds a huge value, squares it in double precision to raise overflow and
// inexact, and hands x and the result to __libm_error_region with tag 14.
{ .mmi
sub rTmp = rExp_mask, r0, 1 // rExp_mask - 0 - 1 = 0x1ffff - 1 = 0x1fffe
;;
setf.exp fTmp = rTmp
nop.i 0
}
;;
{ .mfi
alloc r32=ar.pfs,1,4,4,0 // 1 input, 4 locals, 4 outputs for the call
fmerge.s FR_X = f8,f8 // Copy x for the error handler before the result is formed
nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 14
fma.d.s0 FR_RESULT = fTmp, fTmp, fTmp // Set I,O and +INF result
br.cond.sptk __libm_error_region
}
;;
EXP_POSSIBLE_UNDERFLOW:
// Here if fMAX_DBL_ZERO_ARG < x < fMIN_DBL_NORM_ARG
// Underflow is a possibility, not a certainty
// We define an underflow when the answer with
// ftz set
// is zero (tiny numbers become zero)
// Notice (from below) that if we have an unlimited exponent range,
// then there is an extra machine number E between the largest denormal and
// the smallest normal.
// So if with unbounded exponent we round to E or below, then we are
// tiny and underflow has occurred.
// But notice that you can be in a situation where we are tiny, namely
// rounded to E, but when the exponent is bounded we round to smallest
// normal. So the answer can be the smallest normal with underflow.
// E
// -----+--------------------+--------------------+-----
// | | |
// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe
// 0.1...11 2^-3ffe (biased, 1)
// largest dn smallest normal
//
// Uses fS and fP as computed by the main path.  The result is computed twice:
// once in sf2 with ftz set (only to classify it) and once in sf0 (the value
// actually returned in f8).
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.d.s2 fFtz_urm_f8 = fS, fP, fS // Result with ftz set
nop.i 0
}
;;
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x40 // Turn off ftz in sf2
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow
nop.i 0
}
{ .mfi
nop.m 0
fma.d.s0 f8 = fS, fP, fS // Compute result, set I, maybe U
nop.i 0
}
;;
// p6 and p7 are complementary, so exactly one of these branches is taken.
{ .mbb
nop.m 0
(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow
(p7) br.ret.sptk b0 // Exit if really no underflow
}
;;
EXP_CERTAIN_UNDERFLOW:
// Here if x <= fMAX_DBL_ZERO_ARG (the main path tests with fcmp.le)
// Result will be zero (or smallest denorm if round to +inf) with I, U set
// The result is produced by squaring a tiny value in double precision,
// rather than by evaluating the polynomial.
{ .mmi
mov rTmp = 1
;;
setf.exp fTmp = rTmp // Form small normal
nop.i 0
}
;;
{ .mfi
nop.m 0
fmerge.se fTmp = fTmp, fLn2_by_128_lo // Small with signif lsb 1
nop.i 0
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
br.cond.sptk EXP_UNDERFLOW_COMMON
}
;;
EXP_UNDERFLOW_COMMON:
// Shared tail for both underflow routes.  On entry f8 already holds the
// underflowed result.  x is taken from fNormX, because f8 no longer holds it.
// Determine if underflow result is zero or nonzero
{ .mfi
alloc r32=ar.pfs,1,4,4,0 // 1 input, 4 locals, 4 outputs for the call
fcmp.eq.s1 p6, p0 = f8, f0
nop.i 0
}
;;
{ .mfb
nop.m 0
fmerge.s FR_X = fNormX,fNormX // Pass normalized x to the error handler
(p6) br.cond.spnt EXP_UNDERFLOW_ZERO
}
;;
// NOTE(review): EXP_UNDERFLOW_NONZERO and EXP_UNDERFLOW_ZERO below are
// instruction-for-instruction identical (both pass tag 15), so the zero test
// above does not change what is passed to the error handler.
EXP_UNDERFLOW_NONZERO:
// Here if x < fMIN_DBL_NORM_ARG and result nonzero;
// I, U are set
{ .mfb
mov GR_Parameter_TAG = 15
nop.f 0 // FR_RESULT already set
br.cond.sptk __libm_error_region
}
;;
EXP_UNDERFLOW_ZERO:
// Here if x < fMIN_DBL_NORM_ARG and result zero;
// I, U are set
{ .mfb
mov GR_Parameter_TAG = 15
nop.f 0 // FR_RESULT already set
br.cond.sptk __libm_error_region
}
;;
GLOBAL_IEEE754_END(exp)
// Public aliases and symbol versions for exp.
libm_alias_double_other (__exp, exp)
#ifdef SHARED
// In the shared library, exp@@GLIBC_2.29 is the default version.
.symver exp,exp@@GLIBC_2.29
// __exp_compat is a weak alias of __exp, exported as the older
// exp@GLIBC_2.2 version.
.weak __exp_compat
.set __exp_compat,__exp
.symver __exp_compat,exp@GLIBC_2.2
#endif
// Error-handling tail shared by the exp paths above.
// Expects FR_X, FR_RESULT and GR_Parameter_TAG to have been set by the
// branching code.  Creates a 64-byte frame, stores FR_Y, FR_X and FR_RESULT
// into it, calls __libm_error_support with pointers to those three slots, and
// returns the value found in the result slot after the call.
//
// Frame layout after "add sp=-64,sp" (offsets from the new sp):
//   sp+16  parameter 1 (FR_X)
//   sp+32  parameter 2 (FR_Y)
//   sp+48  parameter 3 (FR_RESULT); reloaded into f8 after the call
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp-32 = new sp+32)
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack, then point at sp+48
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the parameter 2 slot (sp+32)
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
add GR_Parameter_RESULT = 48,sp // Recompute the result slot address after the call
nop.m 0
nop.i 0
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,609 +0,0 @@
.file "exp10.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 08/25/00 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 09/06/02 Improved performance; no inexact flags on exact cases
// 01/29/03 Added missing } to bundle templates
// 12/16/04 Call error handling on underflow.
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================
// double exp10(double)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x= (K + fh + fl + r)/log2(10), where
// K is an integer, fh= 0.b1 b2 b3 b4 b5,
// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
// and |r|<2^{-11}
// Th is a table that stores 2^fh (32 entries) rounded to
// double extended precision (only mantissa is stored)
// Tl is a table that stores 2^fl (32 entries) rounded to
// double extended precision (only mantissa is stored)
//
// 10^x is approximated as
// 2^K * Th [ f ] * Tl [ f ] * (1+c1*e+c1*r+c2*r^2+c3*r^3+c4*r^4),
// where e= (x*log2(10)_hi-RN(x*log2(10)_hi))+log2(10)_lo*x
// Note there are only 22 non-zero values that produce an exact result:
// 1.0, 2.0, ... 22.0.
// We test for these cases and use s1 to avoid setting the inexact flag.
// Special values
//==============================================================
// exp10(0)= 1
// exp10(+inf)= inf
// exp10(-inf)= 0
//
// Registers used
//==============================================================
// r2-r3, r14-r40
// f6-f15, f32-f52
// p6-p12
//
#include <shlib-compat.h>
// Symbolic register names for exp10.
//
// General registers: r2-r3 and r14-r31 are working registers for the
// computation.  exp10 issues "alloc r32= ar.pfs, 1, 4, 4, 0", so r33-r36 are
// its four locals (the GR_SAVE_* names) and r37-r40 are its four output
// registers (the GR_Parameter_* names used for the __libm_error_support call).
GR_TBL_START = r2
GR_LOG_TBL = r3
GR_OF_LIMIT = r14
GR_UF_LIMIT = r15
GR_EXP_CORR = r16
GR_F_low = r17
GR_F_high = r18
GR_K = r19
GR_Flow_ADDR = r20
GR_BIAS = r21
GR_Fh = r22
GR_Fh_ADDR = r23
GR_EXPMAX = r24
GR_BIAS53 = r25
// GR_ROUNDVAL and GR_SNORM_LIMIT deliberately share r26: GR_ROUNDVAL is
// moved into FR_ROUNDVAL before GR_SNORM_LIMIT is loaded.
GR_ROUNDVAL = r26
GR_SNORM_LIMIT = r26
GR_MASK = r27
GR_KF0 = r28
GR_MASK_low = r29
GR_COEFF_START = r30
GR_exact_limit = r31
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point registers.  FR_X, FR_Y and FR_RESULT are the three values
// that __libm_error_region stores to the stack for __libm_error_support.
// NOTE(review): FR_X shares f10 with FR_LOG2_10 (below), which the main path
// uses as a working register.  f10 therefore holds x for the error handler
// only if x is copied into FR_X before branching to __libm_error_region.
FR_X = f10
FR_Y = f1
FR_RESULT = f8
FR_COEFF1 = f6
FR_COEFF2 = f7
FR_R = f9
FR_LOG2_10 = f10
FR_2P53 = f11
FR_KF0 = f12
FR_COEFF3 = f13
FR_COEFF4 = f14
FR_UF_LIMIT = f15
FR_OF_LIMIT = f32
FR_DX_L210 = f33
FR_ROUNDVAL = f34
FR_KF = f35
FR_2_TO_K = f36
FR_T_low = f37
FR_T_high = f38
FR_P34 = f39
FR_R2 = f40
FR_P12 = f41
FR_T_low_K = f42
FR_P14 = f43
FR_T = f44
FR_P = f45
FR_L2_10_low = f46
FR_L2_10_high = f47
FR_E0 = f48
FR_E = f49
FR_exact_limit = f50
FR_int_x = f51
FR_SNORM_LIMIT = f52
// Data tables
//==============================================================
RODATA
.align 16
// Constants for exp10, five 16-byte entries (80 bytes).  exp10 reads them
// sequentially through GR_COEFF_START with post-increment 16, so the order
// here must match the order of the loads: log2(10) scaled, log2(10)_low,
// the (C_3, C_4) double pair, C_1, C_2.
// After the fifth load exp10 uses GR_COEFF_START as the base of the low half
// of T_table and GR_COEFF_START+256 as the base of the high half, so T_table
// is assumed to follow this object immediately.
LOCAL_OBJECT_START(poly_coeffs)
data8 0xd49a784bcd1b8afe, 0x00003fcb // log2(10)*2^(10-63)
data8 0x9257edfe9b5fb698, 0x3fbf // log2(10)_low (bits 64...127)
data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
LOCAL_OBJECT_END(poly_coeffs)
LOCAL_OBJECT_START(T_table)
// 2^{0.00000 b6 b7 b8 b9 b10}
data8 0x8000000000000000, 0x8016302f17467628
data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
data8 0x80855ad965e88b83, 0x809ba2264dada76a
data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
data8 0x813801881d886f7b, 0x814e67cceb90502c
data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
data8 0x8272fb97b2a5894c, 0x828998760d01faf3
data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
//
// 2^{0.b1 b2 b3 b4 b5}
data8 0x8000000000000000, 0x82cd8698ac2ba1d7
data8 0x85aac367cc487b14, 0x88980e8092da8527
data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
data8 0x9ef5326091a111ad, 0xa27043030c496818
data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
data8 0xbd08a39f580c36be, 0xc12c4cca66709456
data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
LOCAL_OBJECT_END(T_table)
.section .text
// double exp10(double x): entry point and main path.
// In:  f8 = x.   Out: f8 = 10^x.
//
// NaN, infinity and zero inputs branch to SPECIAL_exp10.  Arguments above
// FR_OF_LIMIT or below FR_UF_LIMIT branch to OUT_RANGE_exp10.  Otherwise the
// result T + T*P is computed; it is returned directly when
// x >= FR_SNORM_LIMIT, and passed through __libm_error_region with tag 265
// when the result lies in the denormal range.
//
// Predicates: p6 = (x < 0), p8 = !(x < 0); p9/p7 = result exact/inexact;
// p11 = result is normal; p12 = take the special or out-of-range branch.
GLOBAL_IEEE754_ENTRY(exp10)
{.mfi
alloc r32= ar.pfs, 1, 4, 4, 0
// will continue only for non-zero normal/denormal numbers
fclass.nm.unc p12, p7= f8, 0x1b
mov GR_BIAS53= 0xffff+63-10
}
{.mlx
// GR_TBL_START= pointer to log2(10), C_1...C_4 followed by T_table
addl GR_TBL_START= @ltoff(poly_coeffs), gp
movl GR_ROUNDVAL= 0x3fc00000 // 1.5 (SP)
}
;;
{.mfi
ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
nop.i 0
}
;;
{.mlx
setf.exp FR_2P53= GR_BIAS53 // 2^{63-10}
movl GR_UF_LIMIT= 0xc07439b746e36b52 // (-2^10-51) / log2(10)
}
{.mlx
setf.s FR_ROUNDVAL= GR_ROUNDVAL
movl GR_OF_LIMIT= 0x40734413509f79fe // Overflow threshold
}
;;
{.mlx
ldfe FR_LOG2_10= [ GR_COEFF_START ], 16 // load log2(10)*2^(10-63)
movl GR_SNORM_LIMIT= 0xc0733a7146f72a41 // Smallest normal threshold
}
{.mib
nop.m 0
nop.i 0
(p12) br.cond.spnt SPECIAL_exp10 // Branch if nan, inf, zero
}
;;
{.mmf
ldfe FR_L2_10_low= [ GR_COEFF_START ], 16 // load log2(10)_low
setf.d FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
fma.s0 f8= f8, f1, f0 // normalize x
}
;;
{.mfi
ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4
(p8) fcvt.fx.s1 FR_int_x = f8 // Convert x to integer
nop.i 0
}
{.mfi
setf.d FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
fma.s1 FR_KF0= f8, FR_LOG2_10, FR_ROUNDVAL // y= (x*log2(10)*2^10 +
// 1.5*2^63) * 2^(-63)
mov GR_EXP_CORR= 0xffff-126
}
;;
{.mfi
setf.d FR_SNORM_LIMIT= GR_SNORM_LIMIT // Set smallest normal limit
fma.s1 FR_L2_10_high= FR_LOG2_10, FR_2P53, f0 // FR_L2_10_high= log2(10)_hi
nop.i 0
}
;;
{.mfi
ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)*2^(10-63)
mov GR_MASK= 1023
}
;;
{.mfi
ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
fma.s1 FR_LOG2_10= f8, FR_L2_10_high, f0 // y0= x*log2(10)_hi
mov GR_MASK_low= 31
}
;;
{.mlx
getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
(p8) movl GR_exact_limit= 0x41b00000 // Largest x for exact result,
// +22.0
}
;;
{.mfi
add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
fcmp.gt.s1 p12, p7= f8, FR_OF_LIMIT // x>overflow threshold ?
nop.i 0
}
;;
{.mfi
(p8) setf.s FR_exact_limit = GR_exact_limit // Largest x for exact result
(p8) fcvt.xf FR_int_x = FR_int_x // Integral part of x
shr GR_K= GR_KF0, 10 // K
}
{.mfi
and GR_F_high= GR_MASK, GR_KF0 // f_high*32
fnma.s1 FR_R= FR_KF, FR_2P53, FR_LOG2_10 // r= x*log2(10)-2^{63-10}*
// [ (K+f)*2^{10-63} ]
and GR_F_low= GR_KF0, GR_MASK_low // f_low
}
;;
{.mmi
shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
add GR_BIAS= GR_K, GR_EXP_CORR // K= bias-2*63
shr GR_Fh= GR_F_high, 5 // f_high
}
;;
{.mfi
setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
(p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
}
{.mfi
ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
fms.s1 FR_DX_L210= f8, FR_L2_10_high, FR_LOG2_10 // x*log2(10)_hi-
// RN(x*log2(10)_hi)
nop.i 0
}
;;
{.mfi
ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r
nop.i 0
}
{.mfb
nop.m 0
fma.s1 FR_R2= FR_R, FR_R, f0 // r*r
(p12) br.cond.spnt OUT_RANGE_exp10
}
;;
{.mfi
nop.m 0
// e= (x*log2(10)_hi-RN(x*log2(10)_hi))+log2(10)_lo*x
fma.s1 FR_E0= f8, FR_L2_10_low, FR_DX_L210
cmp.eq p7,p9= r0,r0 // Assume inexact result
}
{.mfi
nop.m 0
fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
nop.i 0
}
;;
{.mfi
nop.m 0
(p8) fcmp.eq.s1 p9,p7= FR_int_x, f8 // Test x positive integer
nop.i 0
}
{.mfi
nop.m 0
fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
nop.i 0
}
;;
{.mfi
nop.m 0
fcmp.ge.s1 p11,p0= f8, FR_SNORM_LIMIT // Test x for normal range
nop.i 0
}
{.mfi
nop.m 0
// Preserve x for the error handler.  __libm_error_region stores FR_X as
// parameter 1, but FR_X shares f10 with FR_LOG2_10, which at this point
// holds x*log2(10)_hi.  f10 is not read again below and f8 is about to be
// overwritten with the result, so copy x into FR_X here.  Without this the
// denormal-result path reports x*log2(10)_hi instead of x.
fmerge.s FR_X= f8, f8
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_E= FR_E0, FR_COEFF1, f0 // E= C_1*e
nop.i 0
}
{.mfi
nop.m 0
fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34
nop.i 0
}
;;
// If x a positive integer, will it produce an exact result?
// p7 result will be inexact
// p9 result will be exact
{.mfi
nop.m 0
(p9) fcmp.le.s1 p9,p7= f8, FR_exact_limit // Test x gives exact result
nop.i 0
}
{.mfi
nop.m 0
fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_P= FR_P14, FR_R, FR_E // P= P14*r+E
nop.i 0
}
;;
.pred.rel "mutex",p7,p9
{.mfi
nop.m 0
(p7) fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P, inexact set
nop.i 0
}
{.mfb
nop.m 0
(p9) fma.d.s1 f8= FR_P, FR_T, FR_T // result= T+T*P, exact use s1
(p11) br.ret.sptk b0 // return, if result normal
}
;;
// Here if result in denormal range (and not zero)
// FR_X was loaded with x above; FR_RESULT (f8) holds the computed result.
{.mib
nop.m 0
mov GR_Parameter_TAG= 265
br.cond.sptk __libm_error_region // Branch to error handling
}
;;
SPECIAL_exp10:
// Here if x is not a non-zero normal/denormal number (entry fclass 0x1b test
// failed).  f8 still holds the caller's x.  Cases are tried in order:
// -inf returns f0, +inf returns x unchanged, +/-0 returns f1, and anything
// left is passed through a double-precision fma in sf0.
{.mfi
nop.m 0
fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
nop.i 0
}
;;
{.mfi
nop.m 0
fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
nop.i 0
}
;;
{.mfi
nop.m 0
fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
nop.i 0
}
{.mfb
nop.m 0
(p6) mov f8= f0 // exp10(-Infinity)= 0
(p6) br.ret.spnt b0
}
;;
{.mfb
nop.m 0
nop.f 0
(p7) br.ret.spnt b0 // exp10(+Infinity)= +Infinity
}
;;
{.mfb
nop.m 0
(p8) mov f8= f1 // exp10(+/-0)= 1
(p8) br.ret.spnt b0
}
;;
{.mfb
nop.m 0
fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs
br.ret.sptk b0
}
;;
OUT_RANGE_exp10:
// Here if x is above FR_OF_LIMIT or below FR_UF_LIMIT.  f8 still holds the
// normalized x.  p6/p8 come from the entry test "x < 0":
// underflow: p6= 1
// overflow: p8= 1
// A value with a huge (overflow) or tiny (underflow) exponent is squared in
// double precision to produce the result and raise the flags; then x, the
// result and the tag (166 overflow, 265 underflow) go to __libm_error_region.
.pred.rel "mutex",p6,p8
{.mfi
(p8) mov GR_EXPMAX= 0x1fffe
// Preserve x for the error handler.  __libm_error_region stores FR_X as
// parameter 1, but FR_X shares f10 with FR_LOG2_10, which the main path left
// holding x*log2(10)_hi.  Copy x before f8 is overwritten with the result.
fmerge.s FR_X= f8, f8
(p6) mov GR_EXPMAX= 1
}
;;
{.mii
setf.exp FR_R= GR_EXPMAX
(p8) mov GR_Parameter_TAG= 166
(p6) mov GR_Parameter_TAG= 265
}
;;
{.mfb
nop.m 0
fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow/underflow
br.cond.sptk __libm_error_region // Branch to error handling
}
;;
GLOBAL_IEEE754_END(exp10)
// Public alias for exp10.
libm_alias_double_other (__exp10, exp10)
// For the version range in which the compat symbol is still provided, export
// exp10 under the old name pow10 at version GLIBC_2_2.
#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_27)
compat_symbol (libm, exp10, pow10, GLIBC_2_2)
#endif
// Error-handling tail shared by the exp10 paths above.
// Expects FR_X (argument), FR_RESULT (f8, tentative result) and
// GR_Parameter_TAG to have been set by the branching code.  Creates a 64-byte
// frame, stores FR_Y, FR_X and FR_RESULT into it, calls __libm_error_support
// with r37-r40 holding the three slot addresses and the tag, and returns the
// value found in the result slot after the call.
//
// Frame layout after "add sp= -64, sp" (offsets from the new sp):
//   sp+16  parameter 1 (FR_X)
//   sp+32  parameter 2 (FR_Y)
//   sp+48  parameter 3 (FR_RESULT); reloaded into f8 after the call
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{.mfi
add GR_Parameter_Y= -32, sp // Parameter 2 address (old sp-32 = new sp+32)
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
}
{.mfi
.fframe 64
add sp= -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP= gp // Save gp
}
;;
{.mmi
stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack, then point at sp+48
add GR_Parameter_X= 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0= b0 // Save b0
}
;;
.body
{.mib
stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{.mib
stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y= -16, GR_Parameter_Y // Back to the parameter 2 slot (sp+32)
br.call.sptk b0= __libm_error_support# // Call error handling function
}
;;
{.mmi
add GR_Parameter_RESULT= 48, sp // Recompute the result slot address after the call
nop.m 0
nop.i 0
}
;;
{.mmi
ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp= 64, sp // Restore stack pointer
mov b0= GR_SAVE_B0 // Restore return address
}
;;
{.mib
mov gp= GR_SAVE_GP // Restore gp
mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
}
;;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#, @function
.global __libm_error_support#

View File

@ -1,5 +0,0 @@
/* IA64 does not provide the finite symbol alias, so libm_alias_finite is
   redefined as an empty macro before the generic flt-32 exp10f
   implementation is included.  */
#include <libm-alias-finite.h>
#undef libm_alias_finite
#define libm_alias_finite(a, b)
#include <sysdeps/ieee754/flt-32/e_exp10f.c>

View File

@ -1,814 +0,0 @@
.file "exp10l.s"
// Copyright (c) 2000 - 2004, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 08/25/00 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 05/08/03 Reformatted assembly source; corrected overflow result for round to
// -inf and round to zero; exact results now don't set inexact flag
// 12/16/04 Call error handling on underflow.
//
// API
//==============================================================
// long double exp10l(long double)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x= (K + f + r)/log2(10), where
// K is an integer, f= 0.b1 b2... b8 (f>= 0),
// and |r|<2^{-9}
// T is a table that stores 2^f (256 entries) rounded to
// double extended precision (only mantissa is stored)
// D stores (2^f/T [ f ] - 1), rounded to single precision
//
// 10^x is approximated as
// 2^K * T [ f ] * ((1+c1*r+c2*r^2+...+c6*r^6)*(1+c1*e)+D [ f ] ),
// where e= log2(10)_lo*x+(log2(10)_hi*x-RN(log2(10)_hi*x))
//
// Special values
//==============================================================
// exp10(0)= 1
// exp10(+inf)= inf
// exp10(-inf)= 0
//
// Registers used
//==============================================================
// f6-f15, f32-f63
// r14-r30, r32-r40
// p6-p8, p11-p14
//
#include <shlib-compat.h>
// Symbolic register names for exp10l.
//
// FR_X, FR_Y and FR_RESULT follow the naming used for the values handed to
// the libm error handler.
// NOTE(review): FR_X shares f10 with FR_LOG10 (below), so f10 holds x for the
// error handler only if x is copied into FR_X before the error branch.
// Verify against the error paths of exp10l.
FR_X = f10
FR_Y = f1
FR_RESULT = f8
FR_COEFF1 = f6
// NOTE(review): FR_COEFF2 is assigned twice in this list, f7 here and f47
// further down.  Presumably the later assignment is the one in effect when
// the code is assembled; confirm against the assembler's symbol-assignment
// rules before relying on either value.
FR_COEFF2 = f7
FR_KF0 = f9
FR_LOG10 = f10
FR_CONST1 = f11
FR_XL10 = f12
FR_COEFF3 = f13
FR_COEFF4 = f14
FR_UF_TEST = f15
FR_OF_TEST = f32
FR_L10_LOW = f33
FR_COEFF5 = f34
FR_COEFF6 = f35
FR_L10 = f36
FR_C_L10 = f37
FR_XL10_H = f38
FR_XL10_L = f39
FR_KF = f40
FR_E = f41
FR_T = f42
FR_D = f43
FR_EXP_M_63 = f44
FR_R = f45
FR_E1 = f46
FR_COEFF2 = f47
FR_P34 = f48
FR_P56 = f49
FR_R2 = f50
FR_RE = f51
FR_D1 = f52
FR_P36 = f53
FR_R3E = f54
FR_P1 = f55
FR_P = f56
FR_T1 = f57
FR_XINT = f58
FR_XINTF = f59
FR_4 = f60
FR_28 = f61
FR_32 = f62
FR_SNORM_LIMIT = f63
// General registers: r14-r30 are working registers.  r33-r40 carry the same
// GR_SAVE_* / GR_Parameter_* names as the other libm error-handling tails.
GR_ADDR0 = r14
GR_D_ADDR = r15
GR_ADDR = r16
GR_B63 = r17
GR_KBITS = r18
GR_F = r19
GR_K = r20
GR_D = r21
GR_BM63 = r22
GR_T = r23
GR_CONST1 = r24
GR_EMIN = r25
GR_CONST2 = r26
GR_BM8 = r27
GR_SREG = r28
GR_4_BIAS = r29
GR_32_BIAS = r30
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT= r39
GR_Parameter_TAG = r40
// Data tables
//==============================================================
RODATA
.align 16
// Constants for exp10l (128 bytes).  exp10l reads them sequentially through
// GR_ADDR0 with post-increment 16, so the order here must match the order of
// the loads: four 16-byte extended values, two double pairs (C_3,C_4) and
// (C_5,C_6), then two more 16-byte extended values.
// exp10l computes the D_table address as a fixed offset from its position in
// this object (2048-64+96+32 bytes past the fourth entry's end), so
// poly_coeffs, T_table and D_table are assumed to be laid out contiguously in
// this order.
LOCAL_OBJECT_START(poly_coeffs)
data8 0xd49a784bcd1b8afe, 0x00004008 // log2(10)*2^8
data8 0x9a209a84fbcff798, 0x0000400b // overflow threshold
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
data8 0x3fac6b08d704a0c0 // C_3
data8 0x3f83b2ab6fba4e77 // C_4
data8 0x3f55d87fe78a6731 // C_5
data8 0x3f2430912f86c787 // C_6
data8 0x9257edfe9b5fb698, 0x00003fbf // log2(10)_low (bits 64...127)
data8 0x9a1bc98027a81918, 0x0000c00b // Smallest normal threshold
LOCAL_OBJECT_END(poly_coeffs)
LOCAL_OBJECT_START(T_table)
// 2^{0.b1 b2 b3 b4 b5 b6 b7 b8}
data8 0x8000000000000000, 0x8058d7d2d5e5f6b1
data8 0x80b1ed4fd999ab6c, 0x810b40a1d81406d4
data8 0x8164d1f3bc030773, 0x81bea1708dde6056
data8 0x8218af4373fc25ec, 0x8272fb97b2a5894c
data8 0x82cd8698ac2ba1d7, 0x83285071e0fc4547
data8 0x8383594eefb6ee37, 0x83dea15b9541b132
data8 0x843a28c3acde4046, 0x8495efb3303efd30
data8 0x84f1f656379c1a29, 0x854e3cd8f9c8c95d
data8 0x85aac367cc487b15, 0x86078a2f23642a9f
data8 0x8664915b923fba04, 0x86c1d919caef5c88
data8 0x871f61969e8d1010, 0x877d2afefd4e256c
data8 0x87db357ff698d792, 0x88398146b919f1d4
data8 0x88980e8092da8527, 0x88f6dd5af155ac6b
data8 0x8955ee03618e5fdd, 0x89b540a7902557a4
data8 0x8a14d575496efd9a, 0x8a74ac9a79896e47
data8 0x8ad4c6452c728924, 0x8b3522a38e1e1032
data8 0x8b95c1e3ea8bd6e7, 0x8bf6a434adde0085
data8 0x8c57c9c4646f4dde, 0x8cb932c1bae97a95
data8 0x8d1adf5b7e5ba9e6, 0x8d7ccfc09c50e2f8
data8 0x8ddf042022e69cd6, 0x8e417ca940e35a01
data8 0x8ea4398b45cd53c0, 0x8f073af5a2013520
data8 0x8f6a8117e6c8e5c4, 0x8fce0c21c6726481
data8 0x9031dc431466b1dc, 0x9095f1abc540ca6b
data8 0x90fa4c8beee4b12b, 0x915eed13c89689d3
data8 0x91c3d373ab11c336, 0x9228ffdc10a051ad
data8 0x928e727d9531f9ac, 0x92f42b88f673aa7c
data8 0x935a2b2f13e6e92c, 0x93c071a0eef94bc1
data8 0x9426ff0fab1c04b6, 0x948dd3ac8ddb7ed3
data8 0x94f4efa8fef70961, 0x955c5336887894d5
data8 0x95c3fe86d6cc7fef, 0x962bf1cbb8d97560
data8 0x96942d3720185a00, 0x96fcb0fb20ac4ba3
data8 0x97657d49f17ab08e, 0x97ce9255ec4357ab
data8 0x9837f0518db8a96f, 0x98a1976f7597e996
data8 0x990b87e266c189aa, 0x9975c1dd47518c77
data8 0x99e0459320b7fa65, 0x9a4b13371fd166ca
data8 0x9ab62afc94ff864a, 0x9b218d16f441d63d
data8 0x9b8d39b9d54e5539, 0x9bf93118f3aa4cc1
data8 0x9c6573682ec32c2d, 0x9cd200db8a0774cb
data8 0x9d3ed9a72cffb751, 0x9dabfdff6367a2aa
data8 0x9e196e189d472420, 0x9e872a276f0b98ff
data8 0x9ef5326091a111ae, 0x9f6386f8e28ba651
data8 0x9fd228256400dd06, 0xa041161b3d0121be
data8 0xa0b0510fb9714fc2, 0xa11fd9384a344cf7
data8 0xa18faeca8544b6e4, 0xa1ffd1fc25cea188
data8 0xa27043030c496819, 0xa2e102153e918f9e
data8 0xa3520f68e802bb93, 0xa3c36b345991b47c
data8 0xa43515ae09e6809e, 0xa4a70f0c95768ec5
data8 0xa5195786be9ef339, 0xa58bef536dbeb6ee
data8 0xa5fed6a9b15138ea, 0xa6720dc0be08a20c
data8 0xa6e594cfeee86b1e, 0xa7596c0ec55ff55b
data8 0xa7cd93b4e965356a, 0xa8420bfa298f70d1
data8 0xa8b6d5167b320e09, 0xa92bef41fa77771b
data8 0xa9a15ab4ea7c0ef8, 0xaa1717a7b5693979
data8 0xaa8d2652ec907629, 0xab0386ef48868de1
data8 0xab7a39b5a93ed337, 0xabf13edf162675e9
data8 0xac6896a4be3fe929, 0xace0413ff83e5d04
data8 0xad583eea42a14ac6, 0xadd08fdd43d01491
data8 0xae493452ca35b80e, 0xaec22c84cc5c9465
data8 0xaf3b78ad690a4375, 0xafb51906e75b8661
data8 0xb02f0dcbb6e04584, 0xb0a957366fb7a3c9
data8 0xb123f581d2ac2590, 0xb19ee8e8c94feb09
data8 0xb21a31a66618fe3b, 0xb295cff5e47db4a4
data8 0xb311c412a9112489, 0xb38e0e38419fae18
data8 0xb40aaea2654b9841, 0xb487a58cf4a9c180
data8 0xb504f333f9de6484, 0xb58297d3a8b9f0d2
data8 0xb60093a85ed5f76c, 0xb67ee6eea3b22b8f
data8 0xb6fd91e328d17791, 0xb77c94c2c9d725e9
data8 0xb7fbefca8ca41e7c, 0xb87ba337a1743834
data8 0xb8fbaf4762fb9ee9, 0xb97c143756844dbf
data8 0xb9fcd2452c0b9deb, 0xba7de9aebe5fea09
data8 0xbaff5ab2133e45fb, 0xbb81258d5b704b6f
data8 0xbc034a7ef2e9fb0d, 0xbc85c9c560e7b269
data8 0xbd08a39f580c36bf, 0xbd8bd84bb67ed483
data8 0xbe0f6809860993e2, 0xbe935317fc378238
data8 0xbf1799b67a731083, 0xbf9c3c248e2486f8
data8 0xc0213aa1f0d08db0, 0xc0a6956e8836ca8d
data8 0xc12c4cca66709456, 0xc1b260f5ca0fbb33
data8 0xc238d2311e3d6673, 0xc2bfa0bcfad907c9
data8 0xc346ccda24976407, 0xc3ce56c98d21b15d
data8 0xc4563ecc5334cb33, 0xc4de8523c2c07baa
data8 0xc5672a115506dadd, 0xc5f02dd6b0bbc3d9
data8 0xc67990b5aa245f79, 0xc70352f04336c51e
data8 0xc78d74c8abb9b15d, 0xc817f681416452b2
data8 0xc8a2d85c8ffe2c45, 0xc92e1a9d517f0ecc
data8 0xc9b9bd866e2f27a3, 0xca45c15afcc72624
data8 0xcad2265e4290774e, 0xcb5eecd3b38597c9
data8 0xcbec14fef2727c5d, 0xcc799f23d11510e5
data8 0xcd078b86503dcdd2, 0xcd95da6a9ff06445
data8 0xce248c151f8480e4, 0xceb3a0ca5dc6a55d
data8 0xcf4318cf191918c1, 0xcfd2f4683f94eeb5
data8 0xd06333daef2b2595, 0xd0f3d76c75c5db8d
data8 0xd184df6251699ac6, 0xd2164c023056bcab
data8 0xd2a81d91f12ae45a, 0xd33a5457a3029054
data8 0xd3ccf099859ac379, 0xd45ff29e0972c561
data8 0xd4f35aabcfedfa1f, 0xd5872909ab75d18a
data8 0xd61b5dfe9f9bce07, 0xd6aff9d1e13ba2fe
data8 0xd744fccad69d6af4, 0xd7da67311797f56a
data8 0xd870394c6db32c84, 0xd9067364d44a929c
data8 0xd99d15c278afd7b6, 0xda3420adba4d8704
data8 0xdacb946f2ac9cc72, 0xdb63714f8e295255
data8 0xdbfbb797daf23755, 0xdc9467913a4f1c92
data8 0xdd2d818508324c20, 0xddc705bcd378f7f0
data8 0xde60f4825e0e9124, 0xdefb4e1f9d1037f2
data8 0xdf9612deb8f04420, 0xe031430a0d99e627
data8 0xe0ccdeec2a94e111, 0xe168e6cfd3295d23
data8 0xe2055afffe83d369, 0xe2a23bc7d7d91226
data8 0xe33f8972be8a5a51, 0xe3dd444c46499619
data8 0xe47b6ca0373da88d, 0xe51a02ba8e26d681
data8 0xe5b906e77c8348a8, 0xe658797368b3a717
data8 0xe6f85aaaee1fce22, 0xe798aadadd5b9cbf
data8 0xe8396a503c4bdc68, 0xe8da9958464b42ab
data8 0xe97c38406c4f8c57, 0xea1e4756550eb27b
data8 0xeac0c6e7dd24392f, 0xeb63b74317369840
data8 0xec0718b64c1cbddc, 0xecaaeb8ffb03ab41
data8 0xed4f301ed9942b84, 0xedf3e6b1d418a491
data8 0xee990f980da3025b, 0xef3eab20e032bc6b
data8 0xefe4b99bdcdaf5cb, 0xf08b3b58cbe8b76a
data8 0xf13230a7ad094509, 0xf1d999d8b7708cc1
data8 0xf281773c59ffb13a, 0xf329c9233b6bae9c
data8 0xf3d28fde3a641a5b, 0xf47bcbbe6db9fddf
data8 0xf5257d152486cc2c, 0xf5cfa433e6537290
data8 0xf67a416c733f846e, 0xf7255510c4288239
data8 0xf7d0df730ad13bb9, 0xf87ce0e5b2094d9c
data8 0xf92959bb5dd4ba74, 0xf9d64a46eb939f35
data8 0xfa83b2db722a033a, 0xfb3193cc4227c3f4
data8 0xfbdfed6ce5f09c49, 0xfc8ec01121e447bb
data8 0xfd3e0c0cf486c175, 0xfdedd1b496a89f35
data8 0xfe9e115c7b8f884c, 0xff4ecb59511ec8a5
LOCAL_OBJECT_END(T_table)
LOCAL_OBJECT_START(D_table)
data4 0x00000000, 0x9f55c08f, 0x1e93ffa3, 0x1dcd43a8
data4 0x1f751f79, 0x9f3cdd88, 0x9f43d155, 0x1eda222c
data4 0x1ef35513, 0x9f597895, 0x9e698881, 0x1ec71073
data4 0x1e50e371, 0x9dc01e19, 0x1de74133, 0x1e2f028c
data4 0x9edefb47, 0x1ebbac48, 0x9e8b0330, 0x9e9e9314
data4 0x1edc1d11, 0x1f098529, 0x9f52827c, 0x1f50050d
data4 0x1f301e8e, 0x1f5b64d1, 0x9f45e3ee, 0x9ef64d6d
data4 0x1d6ec5e8, 0x9e61ad9a, 0x1d44ccbb, 0x9e4a8bbb
data4 0x9cf11576, 0x9dcce7e7, 0x9d02ac90, 0x1f26ccf0
data4 0x9f0877c6, 0x9ddd62ae, 0x9f4b7fc3, 0x1ea8ef6b
data4 0x1ea4378d, 0x1ef6fc38, 0x1db99fd9, 0x1f22bf6f
data4 0x1f53e172, 0x1e85504a, 0x9f37cc75, 0x1f0c5e17
data4 0x1dde8aac, 0x9cb42bb2, 0x1e153cd7, 0x1eb62bba
data4 0x9e9b941b, 0x9ea80e3c, 0x1f508823, 0x1ec3fd36
data4 0x1e9ffaa1, 0x1e21e2eb, 0x9d948b1d, 0x9e8ac93a
data4 0x1ef7ee6f, 0x9e80dda3, 0x1f0814be, 0x1dc5ddfe
data4 0x1eedb9d1, 0x9f2aaa26, 0x9ea5b0fc, 0x1edf702e
data4 0x9e391201, 0x1f1316bb, 0x1ea27fb7, 0x9e05ed18
data4 0x9f199ed2, 0x1ee7fd7c, 0x1f003db6, 0x9eac3793
data4 0x9e5b8c10, 0x9f3af17c, 0x1bc9a8be, 0x1ee3c004
data4 0x9f19b1b2, 0x9f242ce9, 0x9ce67dd1, 0x9e4f6275
data4 0x1e20742c, 0x1eb9328a, 0x9f477153, 0x1d969718
data4 0x9f1e6c43, 0x1f2f67f4, 0x9f39c7e4, 0x9e3c4feb
data4 0x1da3956b, 0x9e7c685d, 0x1f280911, 0x9f0d8afb
data4 0x1e314b40, 0x9eb4f250, 0x9f1a34ad, 0x1ef5d5e7
data4 0x9f145496, 0x1e604827, 0x9f1e5195, 0x1e9c1fc0
data4 0x1efde521, 0x1e69b385, 0x1f316830, 0x9f244eae
data4 0x1f1787ec, 0x9e939971, 0x1f0bb393, 0x9f0511d6
data4 0x1ed919de, 0x1d8b7b28, 0x1e5ca4a9, 0x1e7c357b
data4 0x9e3ff8e8, 0x1eef53b5, 0x9ed22ed7, 0x1f16659b
data4 0x9f2db102, 0x9e2c6a78, 0x1f328d7d, 0x9f2fec3c
data4 0x1eb395bd, 0x9f242b84, 0x9e2683e6, 0x1ed71e68
data4 0x1efd1df5, 0x9e9eeafd, 0x9ed2249c, 0x1eef129a
data4 0x1d1ea44c, 0x9e81f7ff, 0x1eaf77c9, 0x9ee7a285
data4 0x1e1864ed, 0x9ee7edbb, 0x9e15a27d, 0x9ae61655
data4 0x1f1ff1a2, 0x1da29755, 0x9e5f46fb, 0x1e901236
data4 0x9eecfb9b, 0x9f204d2f, 0x1ec64685, 0x9eb809bd
data4 0x9e0026c5, 0x1d9f1da1, 0x1f142b49, 0x9f20f22e
data4 0x1f24b067, 0x1f185a4c, 0x9f09765c, 0x9ece902f
data4 0x1e2ca5db, 0x1e6de464, 0x9f071f67, 0x1f1518c3
data4 0x1ea13ded, 0x1f0b8414, 0x1edb6ad4, 0x9e548740
data4 0x9ea10efb, 0x1ee48a60, 0x1e7954c5, 0x9edad013
data4 0x9f21517d, 0x9e9b6e0c, 0x9ee7f9a6, 0x9ebd4298
data4 0x9d65b24e, 0x1eed751f, 0x9f1573ea, 0x9d430377
data4 0x9e13fc0c, 0x1e47008a, 0x1e3d5c1d, 0x1ef41a91
data4 0x9e4a4ef7, 0x9e952f18, 0x1d620566, 0x1d9b8d33
data4 0x1db06247, 0x1e94b31e, 0x1f0730ad, 0x9d79ffb4
data4 0x1ed64d51, 0x9e91fd11, 0x9e28d35a, 0x9dea0ed9
data4 0x1e891def, 0x9ee28ac0, 0x1e1db99b, 0x9ee1ce38
data4 0x9bdd9bca, 0x1eb72cb9, 0x9e8c53c6, 0x1e0df6ca
data4 0x1e8f2ccd, 0x9e9b0886, 0x1eeb3bc7, 0x1ec7e772
data4 0x9e210776, 0x9daf246c, 0x1ea1f151, 0x1ece4dc6
data4 0x1ce741c8, 0x1ed3c88f, 0x9ec9a4fd, 0x9e0c8d30
data4 0x1d2fbb26, 0x9ef212a7, 0x1ee44f1c, 0x9e445550
data4 0x1e075f77, 0x9d9291a3, 0x1f09c2ee, 0x9e012c88
data4 0x1f057d62, 0x9e7bb0dc, 0x9d8758ee, 0x1ee8d6c1
data4 0x9e509a57, 0x9e4ca7b7, 0x1e2cb341, 0x9ec35106
data4 0x1ecf3baf, 0x1e11781c, 0x1ea0cc78, 0x1eb75ca6
data4 0x1e961e1a, 0x1eb88853, 0x1e7abf50, 0x1ee38704
data4 0x9dc5ab0f, 0x1afe197b, 0x9ec07523, 0x9d9b7f78
data4 0x1f011618, 0x1ed43b0b, 0x9f035945, 0x9e3fd014
data4 0x9bbda5cd, 0x9e83f8ab, 0x1e58a928, 0x1e392d61
data4 0x1efdbb52, 0x1ee310a8, 0x9ec7ecc1, 0x1e8c9ed6
data4 0x9ef82dee, 0x9e70545b, 0x9ea53fc4, 0x1e40f419
LOCAL_OBJECT_END(D_table)
.section .text
// exp10l: computes 10^x for the long double argument in f8 and returns the
// result in f8.
//
// Main path (normal/denormal x only; zero, infinity and NaN are diverted to
// SPECIAL_EXP10):
//   y = x*log2(10)*2^8 and (K+f)*2^8 = round_to_int(y)
//   r = x*log2(10)_hi - (K+f), e = low-order part of x*log2(10)
//   result = T + T*P, where T = 2^{K-63}*T[f] and P is assembled from D[f]
//   and the coefficients C_1...C_6
// Arguments beyond the overflow/underflow thresholds branch to
// OUT_RANGE_EXP10. If x is below the smallest-normal limit the result is
// handed to __libm_error_region with tag 264.
GLOBAL_IEEE754_ENTRY(exp10l)
{.mfi
alloc GR_SREG = ar.pfs, 1, 4, 4, 0
// will continue only for normal/denormal numbers
fclass.nm.unc p12, p7 = f8, 0x1b
// GR_ADDR0 = pointer to log2(10), C_1...C_6 followed by T_table
addl GR_ADDR0 = @ltoff(poly_coeffs), gp ;;
}
{.mfi
// load start address for C_1...C_6 followed by T_table
ld8 GR_ADDR0 = [ GR_ADDR0 ]
// X<0 ? (p6/p8 are consumed later by OUT_RANGE_EXP10 to tell underflow
// from overflow)
fcmp.lt.s1 p6, p8 = f8, f0
// GR_BM8 = bias-8
mov GR_BM8 = 0xffff-8
}
{.mlx
nop.m 0
// GR_EMIN = (-2^14-62)*2^{8}
movl GR_EMIN = 0xca807c00 ;;
}
{.mmb
// FR_CONST1 = 2^{-8}
setf.exp FR_CONST1 = GR_BM8
// load log2(10)*2^8
ldfe FR_LOG10 = [ GR_ADDR0 ], 16
(p12) br.cond.spnt SPECIAL_EXP10 ;;
}
{.mmf
// FR_UF_TEST = underflow threshold (already scaled by 2^8, see GR_EMIN)
setf.s FR_UF_TEST = GR_EMIN
// load overflow threshold
ldfe FR_OF_TEST = [ GR_ADDR0 ], 16
// normalize x
fma.s0 f8 = f8, f1, f0 ;;
}
{.mmi
// load C_1
ldfe FR_COEFF1 = [ GR_ADDR0 ], 16 ;;
// load C_2
ldfe FR_COEFF2 = [ GR_ADDR0 ], 16
nop.i 0 ;;
}
{.mmf
// GR_D_ADDR = pointer to D table
add GR_D_ADDR = 2048-64+96+32, GR_ADDR0
// load C_3, C_4
ldfpd FR_COEFF3, FR_COEFF4 = [ GR_ADDR0 ], 16
// y = x*log2(10)*2^8
fma.s1 FR_XL10 = f8, FR_LOG10, f0 ;;
}
{.mfi
// load C_5, C_6
ldfpd FR_COEFF5, FR_COEFF6 = [ GR_ADDR0 ], 16
// get int(x)
fcvt.fx.trunc.s1 FR_XINT = f8
nop.i 0
}
{.mfi
nop.m 0
// FR_L10 = log2(10): the loaded log2(10)*2^8 scaled back by FR_CONST1 = 2^{-8}
fma.s1 FR_L10 = FR_LOG10, FR_CONST1, f0
nop.i 0 ;;
}
{.mfi
// load log2(10)_low
ldfe FR_L10_LOW = [ GR_ADDR0 ], 16
// y0 = x*log2(10) = x*log2(10)_hi (FR_LOG10 is reused for y0 from here on)
fma.s1 FR_LOG10 = f8, FR_L10, f0
// GR_EMIN is reused: it now holds bias-63 for the 2^{K-63} scale factor
mov GR_EMIN = 0xffff-63
}
{.mfi
// exponent of 32.0 (2^5)
mov GR_32_BIAS = 0xffff + 5
// (K+f)*2^8 = round_to_int(y)
fcvt.fx.s1 FR_KF0 = FR_XL10
// exponent of 4.0 (2^2)
mov GR_4_BIAS = 0xffff + 2;;
}
{.mfi
// load smallest normal limit
ldfe FR_SNORM_LIMIT = [ GR_ADDR0 ], 16
// x>overflow threshold ?
fcmp.gt.s1 p12, p7 = f8, FR_OF_TEST
nop.i 0 ;;
}
{.mfi
setf.exp FR_32 = GR_32_BIAS
// x<underflow threshold ? (compares the scaled y against the scaled limit;
// only evaluated when the overflow test above did not fire)
(p7) fcmp.lt.s1 p12, p7 = FR_XL10, FR_UF_TEST
nop.i 0 ;;
}
{.mfi
setf.exp FR_4 = GR_4_BIAS
// int(x) converted back to floating point, for the "is x an integer" test
fcvt.xf FR_XINTF = FR_XINT
nop.i 0
}
{.mfi
nop.m 0
// FR_L10 = log2(10)_h*x-RN(log2(10)_h*x)
fms.s1 FR_L10 = f8, FR_L10, FR_LOG10
nop.i 0 ;;
}
{.mfi
// GR_BM8 is reused: integer image of (K+f)*2^8
getf.sig GR_BM8 = FR_KF0
// (K+f)*2^8 as a floating-point value
fcvt.xf FR_KF0 = FR_KF0
// mask for the 8 fractional index bits
mov GR_CONST2 = 255 ;;
}
{.mfi
// GR_CONST2 = f
and GR_CONST2 = GR_CONST2, GR_BM8
// FR_L10_LOW = e = log2(10)_l*x+(log2(10)_h*x-RN(log2(10)_h*x))
fma.s1 FR_L10_LOW = FR_L10_LOW, f8, FR_L10
// GR_BM8 = K
shr GR_BM8 = GR_BM8, 8 ;;
}
{.mmi
// address of D (index scaled by 4)
shladd GR_D_ADDR = GR_CONST2, 2, GR_D_ADDR
// K += bias-63 (GR_EMIN was reloaded with 0xffff-63 above)
add GR_BM8 = GR_BM8, GR_EMIN
// address of T (index scaled by 8)
shladd GR_ADDR0 = GR_CONST2, 3, GR_ADDR0 ;;
}
{.mfb
// load D (FR_OF_TEST is reused for D; the overflow test is already done)
ldfs FR_OF_TEST = [ GR_D_ADDR ]
// is input an integer ?
fcmp.eq.s1 p13, p14 = f8, FR_XINTF
(p12) br.cond.spnt OUT_RANGE_EXP10 ;;
}
{.mmf
// load T (FR_UF_TEST is reused for T; the underflow test is already done)
ldf8 FR_UF_TEST = [ GR_ADDR0 ]
// FR_XL10 = 2^{K-63}
setf.exp FR_XL10 = GR_BM8
// r = x*log2(10)_hi-2^{-8}* [ (K+f)*2^{8} ]  (FR_CONST1 = 2^{-8})
fnma.s1 FR_KF0 = FR_KF0, FR_CONST1, FR_LOG10 ;;
}
{.mfi
nop.m 0
// get 28.0
fms.s1 FR_28 = FR_32, f1, FR_4
nop.i 0
}
{.mfi
nop.m 0
// E = 1+C_1*e
fma.s1 FR_L10 = FR_L10_LOW, FR_COEFF1, f1
nop.i 0 ;;
}
{.mfi
nop.m 0
// P12 = C_1+C_2*r
fma.s1 FR_COEFF2 = FR_COEFF2, FR_KF0, FR_COEFF1
nop.i 0
}
{.mfi
nop.m 0
// P34 = C_3+C_4*r
fma.s1 FR_COEFF4 = FR_COEFF4, FR_KF0, FR_COEFF3
nop.i 0 ;;
}
{.mfi
nop.m 0
// P56 = C_5+C_6*r
fma.s1 FR_COEFF5 = FR_COEFF6, FR_KF0, FR_COEFF5
nop.i 0
}
{.mfi
nop.m 0
// r2 = r*r (kept in FR_COEFF3, whose C_3 value was consumed by P34)
fma.s1 FR_COEFF3 = FR_KF0, FR_KF0, f0
nop.i 0 ;;
}
{.mfi
nop.m 0
// if input is integer, is it positive ?
(p13) fcmp.ge.s1 p13, p14 = f8, f0
nop.i 0
}
{.mfi
nop.m 0
// r' = r*E
fma.s1 FR_KF0 = FR_KF0, FR_L10, f0
nop.i 0 ;;
}
{.mfi
nop.m 0
// D' = D+C_1*e
fma.s1 FR_OF_TEST = FR_L10_LOW, FR_COEFF1, FR_OF_TEST
nop.i 0 ;;
}
{.mfi
nop.m 0
// test if x >= smallest normal limit
fcmp.ge.s1 p11, p0 = f8, FR_SNORM_LIMIT
nop.i 0 ;;
}
{.mfi
nop.m 0
// P36 = P34+r2*P56
fma.s1 FR_COEFF4 = FR_COEFF5, FR_COEFF3, FR_COEFF4
nop.i 0
}
{.mfi
nop.m 0
// r3 = r'*r2 (kept in FR_COEFF3)
fma.s1 FR_COEFF3 = FR_COEFF3, FR_KF0, f0
nop.i 0 ;;
}
{.mfi
nop.m 0
// is input below 28.0 ?
(p13) fcmp.lt.s1 p13, p14 = f8, FR_28
nop.i 0
}
{.mfi
nop.m 0
// P' = P12*r'+D'
fma.s1 FR_COEFF2 = FR_COEFF2, FR_KF0, FR_OF_TEST
nop.i 0 ;;
}
{.mfi
nop.m 0
// P = P'+r3*P36
fma.s1 FR_COEFF3 = FR_COEFF3, FR_COEFF4, FR_COEFF2
nop.i 0
}
{.mfi
nop.m 0
// T = 2^{K-63}*T
fma.s1 FR_UF_TEST = FR_UF_TEST, FR_XL10, f0
nop.i 0 ;;
}
// p13: x is a non-negative integer below 28.0; p14: every other argument.
.pred.rel "mutex",p13,p14
{.mfi
nop.m 0
// result = T+T*P, computed under status field s1 instead of s0 for the
// small-integer case
// NOTE(review): presumably so that exactly representable powers of ten do
// not raise the inexact flag - confirm
(p13) fma.s1 f8 = FR_COEFF3, FR_UF_TEST, FR_UF_TEST
nop.i 0
}
{.mfb
nop.m 0
// result = T+T*P
(p14) fma.s0 f8 = FR_COEFF3, FR_UF_TEST, FR_UF_TEST
// return
(p11) br.ret.sptk b0 ;; // return, if result normal
}
// Here if result in denormal range (and not zero)
{.mib
nop.m 0
// tag 264 is the same tag OUT_RANGE_EXP10 uses for underflow
mov GR_Parameter_TAG= 264
br.cond.sptk __libm_error_region // Branch to error handling
}
;;
// Special-operand path of exp10l: reached when the entry fclass test found
// that x is not a normal/denormal number. p6, p7 and p8 are recomputed here
// and each case returns directly to the caller.
SPECIAL_EXP10:
{.mfi
nop.m 0
// x = -Infinity ?
fclass.m p6, p0 = f8, 0x22
nop.i 0 ;;
}
{.mfi
nop.m 0
// x = +Infinity ?
fclass.m p7, p0 = f8, 0x21
nop.i 0 ;;
}
{.mfi
nop.m 0
// x = +/-Zero ?
fclass.m p8, p0 = f8, 0x7
nop.i 0
}
{.mfb
nop.m 0
// exp10(-Infinity) = 0
(p6) mov f8 = f0
(p6) br.ret.spnt b0 ;;
}
{.mfb
nop.m 0
// exp10(+Infinity) = +Infinity (f8 already holds +Infinity, so it is
// returned unchanged)
nop.f 0
(p7) br.ret.spnt b0 ;;
}
{.mfb
nop.m 0
// exp10(+/-0) = 1
(p8) mov f8 = f1
(p8) br.ret.spnt b0 ;;
}
{.mfb
nop.m 0
// Remaining cases: NaNs. x*1+0 is evaluated under status field s0
// NOTE(review): presumably to quiet a signaling NaN and raise invalid -
// confirm
fma.s0 f8 = f8, f1, f0
br.ret.sptk b0 ;;
}
// Out-of-range path of exp10l: the argument is beyond the overflow or the
// underflow threshold. An exponent-only value is squared under s0 to
// produce the overflowed/underflowed result, then control goes to
// __libm_error_region with the matching tag.
// p6/p8 still hold the "x < 0" test made at function entry.
OUT_RANGE_EXP10:
// underflow: p6 = 1
// overflow: p8 = 1
.pred.rel "mutex",p6,p8
{.mmi
// exponent of the value to square: huge for overflow, tiny for underflow
(p8) mov GR_CONST1 = 0x1fffe
(p6) mov GR_CONST1 = 1
nop.i 0
}
;;
{.mii
setf.exp FR_KF0 = GR_CONST1
// error tag handed to __libm_error_support: 165 = overflow, 264 = underflow
(p8) mov GR_Parameter_TAG = 165
(p6) mov GR_Parameter_TAG = 264
}
;;
{.mfb
nop.m 999
fma.s0 f8 = FR_KF0, FR_KF0, f0 // Create overflow/underflow
br.cond.sptk __libm_error_region // Branch to error handling
}
;;
GLOBAL_IEEE754_END(exp10l)
// Alias the implementation under the other long double names.
libm_alias_ldouble_other (__exp10, exp10)
// Keep the old pow10l name as a compat symbol (version GLIBC_2.2) bound to
// exp10l; only emitted when SHLIB_COMPAT covers GLIBC_2_1 up to GLIBC_2_27.
#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_27)
compat_symbol (libm, exp10l, pow10l, GLIBC_2_2)
#endif
// Error-handling stub for exp10l. Expects GR_Parameter_TAG to be set and
// FR_X, FR_Y, FR_RESULT to hold the values to report. It builds a 64-byte
// frame, spills the three values with stfe, calls __libm_error_support and
// returns to the original caller with f8 reloaded from the result slot.
//
// Frame layout after "add sp = -64, sp" (offsets from the new sp):
//   sp+16 : FR_X      (GR_Parameter_X)
//   sp+32 : FR_Y      (GR_Parameter_Y as finally passed)
//   sp+48 : FR_RESULT (GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{.mfi
// old sp - 32, i.e. new sp + 32 once the frame is allocated
add GR_Parameter_Y = -32, sp // Parameter 2 value
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{.mfi
.fframe 64
add sp = -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP = gp ;; // Save gp
}
{.mmi
// post-increment leaves GR_Parameter_Y at sp+48
stfe [ GR_Parameter_Y ] = FR_Y, 16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0 = b0 ;; // Save b0
}
.body
{.mib
stfe [ GR_Parameter_X ] = FR_X // STORE Parameter 1 on stack
// result slot = sp+48
add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{.mib
// the store uses the old GR_Parameter_Y (sp+48); the add in this bundle
// then moves it back to the FR_Y slot at sp+32
stfe [ GR_Parameter_Y ] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16, GR_Parameter_Y
br.call.sptk b0 = __libm_error_support# ;; // Call error handling function
}
{.mmi
// recompute the result address after the call instead of relying on the
// register surviving it
add GR_Parameter_RESULT = 48, sp
nop.m 0
nop.i 0 ;;
}
{.mmi
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp = 64, sp // Restore stack pointer
mov b0 = GR_SAVE_B0 ;; // Restore return address
}
{.mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 ;; // Return
}
LOCAL_LIBM_END(__libm_error_region)
// External error-support routine called above.
.type __libm_error_support#, @function
.global __libm_error_support#

View File

@ -1,570 +0,0 @@
.file "exp2.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 08/25/00 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 09/05/02 Improved performance
// 01/17/03 Fixed to call error support when x=1024.0
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================
// double exp2(double)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x= (K + fh + fl + r), where
// K is an integer, fh= 0.b1 b2 b3 b4 b5,
// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
// and |r|<2^{-11}
// Th is a table that stores 2^fh (32 entries) rounded to
// double extended precision (only mantissa is stored)
// Tl is a table that stores 2^fl (32 entries) rounded to
// double extended precision (only mantissa is stored)
//
// 2^x is approximated as
// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2+c3*r^3+c4*r^4)
// Note: We use the following trick to speed up conversion from FP to integer:
//
// Let x = K + r, where K is an integer, and |r| <= 0.5
// Let N be the number of significand bits for the FP format used
// ( N=64 for double-extended, N=53 for double)
//
// Then let y = 1.5 * 2^(N-1) + x for RN mode
// K = y - 1.5 * 2^(N-1)
// r = x - K
//
// If we want to obtain the integer part and the first m fractional bits of x,
// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m):
//
// Let x = K + f + r
// f = 0.b_1 b_2 ... b_m
// |r| <= 2^(-m-1)
//
// Then let y = 1.5 * 2^(N-1-m) + x for RN mode
// (K+f) = y - 1.5 * 2^(N-1-m)
// r = x - (K+f)
// Special values
//==============================================================
// exp2(0)= 1
// exp2(+inf)= inf
// exp2(-inf)= 0
//
// Registers used
//==============================================================
// r2-r3, r14-r40
// f6-f15, f32-f45
// p6-p8, p12
//
// Symbolic register names used by exp2 and its error stub.
//
// General registers: table pointers, thresholds and index arithmetic.
GR_TBL_START = r2
GR_LOG_TBL = r3
GR_OF_LIMIT = r14
GR_UF_LIMIT = r15
GR_EXP_CORR = r16
GR_F_low = r17
GR_F_high = r18
GR_K = r19
GR_Flow_ADDR = r20
GR_BIAS = r21
GR_Fh = r22
GR_Fh_ADDR = r23
GR_EXPMAX = r24
GR_EMIN = r25
GR_ROUNDVAL = r26
GR_MASK = r27
GR_KF0 = r28
GR_MASK_low = r29
GR_COEFF_START = r30
// Registers used by __libm_error_region to preserve b0, ar.pfs and gp
// across the call to __libm_error_support.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
// NOTE(review): GR_SAVE_SP is not referenced by the exp2 code in this file.
GR_SAVE_SP = r36
// Arguments set up for __libm_error_support: three addresses and the tag.
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point values spilled by __libm_error_region.
FR_X = f10
FR_Y = f1
FR_RESULT = f8
// Floating-point working registers of the main computation.
FR_COEFF1 = f6
FR_COEFF2 = f7
FR_R = f9
FR_KF0 = f12
FR_COEFF3 = f13
FR_COEFF4 = f14
FR_UF_LIMIT = f15
FR_OF_LIMIT = f32
FR_EXPMIN = f33
FR_ROUNDVAL = f34
FR_KF = f35
FR_2_TO_K = f36
FR_T_low = f37
FR_T_high = f38
FR_P34 = f39
FR_R2 = f40
FR_P12 = f41
FR_T_low_K = f42
FR_P14 = f43
FR_T = f44
FR_P = f45
// Data tables
//==============================================================
RODATA
.align 16
// Polynomial coefficients, read front to back by exp2 with a 16-byte
// post-increment per load: C_3/C_4 as a pair of doubles (ldfpd), then C_1
// and C_2 as significand/exponent pairs in 16-byte slots (ldfe).
LOCAL_OBJECT_START(poly_coeffs)
data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
LOCAL_OBJECT_END(poly_coeffs)
// Two 32-entry tables of 8-byte significands. exp2 keeps advancing the
// poly_coeffs pointer and uses it as the base of this table, so T_table is
// expected to follow poly_coeffs directly. The low table is indexed with
// f_low*8; the high table is reached at offset 256 and indexed with f_high*8.
LOCAL_OBJECT_START(T_table)
// 2^{0.00000 b6 b7 b8 b9 b10}
data8 0x8000000000000000, 0x8016302f17467628
data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
data8 0x80855ad965e88b83, 0x809ba2264dada76a
data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
data8 0x813801881d886f7b, 0x814e67cceb90502c
data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
data8 0x8272fb97b2a5894c, 0x828998760d01faf3
data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
//
// 2^{0.b1 b2 b3 b4 b5}
data8 0x8000000000000000, 0x82cd8698ac2ba1d7
data8 0x85aac367cc487b14, 0x88980e8092da8527
data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
data8 0x9ef5326091a111ad, 0xa27043030c496818
data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
data8 0xbd08a39f580c36be, 0xc12c4cca66709456
data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
LOCAL_OBJECT_END(T_table)
.section .text
// double exp2(double x): x arrives in f8 and 2^x is returned in f8.
//
// Main path (non-zero normal/denormal x; zero, infinity and NaN go to
// SPECIAL_exp2):
//   y = x + 1.5*2^(63-10) exposes (K+f)*2^10 in the significand of y
//   r = x - (K+f); f is split into f_high (5 bits) and f_low (5 bits)
//   result = T + T*P with T = 2^{K-126}*T_low*T_high and
//   P = r*(C_1 + C_2*r + r^2*(C_3 + C_4*r))
// x >= 2^10 or x < -1075 branches to OUT_RANGE_exp2. A result computed
// for x < -1022 is reported through __libm_error_region with tag 162.
//
// FR_X is loaded with the caller's x near the top of the routine because
// __libm_error_region stores FR_X as parameter 1 and f8 is overwritten by
// the result before the error path is taken.
WEAK_LIBM_ENTRY(exp2)
{.mfi
alloc r32= ar.pfs, 1, 4, 4, 0
// will continue only for non-zero normal/denormal numbers
fclass.nm p12, p0= f8, 0x1b
// GR_TBL_START= pointer to C_1...C_4 followed by T_table
addl GR_TBL_START= @ltoff(poly_coeffs), gp
}
{.mlx
mov GR_OF_LIMIT= 0xffff + 10 // Exponent of overflow limit
movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP)
}
;;
// Form special constant 1.5*2^(63-10) to give integer part and first 10
// fractional bits of x
{.mfi
setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant
// p6/p8 are consumed by OUT_RANGE_exp2 to tell underflow from overflow
fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
nop.i 0
}
{.mfb
ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
mov FR_X= f8 // Keep the caller's x for the error handler
(p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero
}
;;
{.mlx
setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
movl GR_UF_LIMIT= 0xc4866000 // (-2^10-51) = -1075
}
;;
{.mfi
ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4
fma.s0 f8= f8, f1, f0 // normalize x
nop.i 0
}
;;
{.mmi
setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
mov GR_EXP_CORR= 0xffff-126 // bias-126, added to K below
}
;;
{.mfi
// after this load GR_COEFF_START has advanced past the three 16-byte
// coefficient slots and is used as the base of T_table
ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10)
nop.i 0
}
;;
{.mfi
mov GR_MASK= 1023 // low 10 bits: f_high and f_low
fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)
mov GR_MASK_low= 31 // low 5 bits: f_low
}
;;
{.mfi
getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ?
add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
}
;;
{.mmi
and GR_F_low= GR_KF0, GR_MASK_low // f_low
and GR_F_high= GR_MASK, GR_KF0 // f_high*32 + f_low
shr GR_K= GR_KF0, 10 // K
}
;;
{.mmi
shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
add GR_BIAS= GR_K, GR_EXP_CORR // K + bias-2*63
shr GR_Fh= GR_F_high, 5 // f_high
}
;;
{.mfi
setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f)
shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
}
{.mlx
ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
movl GR_EMIN= 0xc47f8000 // EMIN= -1022
}
;;
{.mfi
ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
// only evaluated when the overflow test above did not fire
(p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
nop.i 0
}
;;
{.mfi
setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN
fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r
nop.i 0
}
{.mfb
nop.m 0
fma.s1 FR_R2= FR_R, FR_R, f0 // r*r
(p12) br.cond.spnt OUT_RANGE_exp2
}
;;
{.mfi
nop.m 0
fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
nop.i 0
}
;;
{.mfi
nop.m 0
// from here p6/p8 mean "x < EMIN" rather than "x < 0"
fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ?
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_P= FR_P14, FR_R, f0 // P= P14*r
nop.i 0
}
;;
{.mfb
nop.m 0
fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P
(p8) br.ret.sptk b0 // return
}
;;
// Here only when x < EMIN: report the underflow with tag 162.
{.mfb
(p6) mov GR_Parameter_TAG= 162
nop.f 0
(p6) br.cond.sptk __libm_error_region
}
;;
// Special-operand path of exp2: reached when the entry fclass test rejected
// x (branch comment at the call site: nan, inf, zero). p6, p7 and p8 are
// recomputed here and each case returns directly to the caller.
SPECIAL_exp2:
{.mfi
nop.m 0
fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
nop.i 0
}
;;
{.mfi
nop.m 0
fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
nop.i 0
}
;;
{.mfi
nop.m 0
fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
nop.i 0
}
{.mfb
nop.m 0
(p6) mov f8= f0 // exp2(-Infinity)= 0
(p6) br.ret.spnt b0
}
;;
{.mfb
nop.m 0
// f8 already holds +Infinity, so it is returned unchanged
nop.f 0
(p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity
}
;;
{.mfb
nop.m 0
(p8) mov f8= f1 // exp2(+/-0)= 1
(p8) br.ret.spnt b0
}
;;
{.mfb
nop.m 0
// x*1+0 rounded to double under status field s0
// NOTE(review): presumably to quiet a signaling NaN and raise invalid -
// confirm
fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs
br.ret.sptk b0
}
;;
// Out-of-range path of exp2: x is at or above the overflow threshold or
// below the underflow threshold. p6/p8 still hold the "x < 0" test made at
// function entry, so exactly one of the two predicated sequences runs.
// Each squares an exponent-only value under s0 to produce the
// overflowed/underflowed double and sets the matching error tag.
//
// There is no branch at the end: execution continues into the code that
// follows WEAK_LIBM_END(exp2), which in this file is __libm_error_region.
// NOTE(review): this relies on the END/ENTRY macros and the symbol
// directives emitting no instructions in between - confirm.
OUT_RANGE_exp2:
// overflow: p8= 1
{.mii
(p8) mov GR_EXPMAX= 0x1fffe
nop.i 0
nop.i 0
}
;;
{.mmb
(p8) mov GR_Parameter_TAG= 161
(p8) setf.exp FR_R= GR_EXPMAX
nop.b 999
}
;;
{.mfi
nop.m 999
(p8) fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow
nop.i 999
}
// underflow: p6= 1
{.mii
(p6) mov GR_Parameter_TAG= 162
// GR_EXPMAX is reused here for the smallest exponent
(p6) mov GR_EXPMAX= 1
nop.i 0
}
;;
{.mmb
nop.m 0
(p6) setf.exp FR_R= GR_EXPMAX
nop.b 999
}
;;
{.mfb
nop.m 999
(p6) fma.d.s0 f8= FR_R, FR_R, f0 // Create underflow
nop.b 0
}
;;
WEAK_LIBM_END(exp2)
// Alias the implementation under the other double names.
libm_alias_double_other (__exp2, exp2)
// Shared builds: exp2 is the default symbol at GLIBC_2.29, and a weak
// __exp2_compat alias of __exp2 is exported as exp2 at the older GLIBC_2.2
// version.
#ifdef SHARED
.symver exp2,exp2@@GLIBC_2.29
.weak __exp2_compat
.set __exp2_compat,__exp2
.symver __exp2_compat,exp2@GLIBC_2.2
#endif
// Error-handling stub for exp2. Expects GR_Parameter_TAG to be set and
// FR_X, FR_Y (f1), FR_RESULT (f8) to hold the values to report. It builds a
// 64-byte frame, spills the three values as doubles, calls
// __libm_error_support and returns to the original caller with f8 reloaded
// from the result slot.
//
// Frame layout after "add sp= -64, sp" (offsets from the new sp):
//   sp+16 : FR_X      (GR_Parameter_X)
//   sp+32 : FR_Y      (GR_Parameter_Y as finally passed)
//   sp+48 : FR_RESULT (GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{.mfi
// old sp - 32, i.e. new sp + 32 once the frame is allocated
add GR_Parameter_Y= -32, sp // Parameter 2 value
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
}
{.mfi
.fframe 64
add sp= -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP= gp // Save gp
}
;;
{.mmi
// post-increment leaves GR_Parameter_Y at sp+48
stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
add GR_Parameter_X= 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0= b0 // Save b0
}
;;
.body
{.mib
stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
// result slot = sp+48
add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{.mib
// the store uses the old GR_Parameter_Y (sp+48); the add in this bundle
// then moves it back to the FR_Y slot at sp+32
stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y= -16, GR_Parameter_Y
br.call.sptk b0= __libm_error_support# // Call error handling function
}
;;
{.mmi
// recompute the result address after the call instead of relying on the
// register surviving it
add GR_Parameter_RESULT= 48, sp
nop.m 0
nop.i 0
}
;;
{.mmi
ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp= 64, sp // Restore stack pointer
mov b0= GR_SAVE_B0 // Restore return address
}
;;
{.mib
mov gp= GR_SAVE_GP // Restore gp
mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
}
;;
LOCAL_LIBM_END(__libm_error_region)
// External error-support routine called above.
.type __libm_error_support#, @function
.global __libm_error_support#

View File

@ -1,545 +0,0 @@
.file "exp2f.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 08/25/00 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 09/05/02 Improved performance and accuracy
// 01/17/03 Fixed to call error support when x=128.0
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================
// float exp2f(float)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x= (K + fh + fl + r), where
// K is an integer, fh= 0.b1 b2 b3 b4 b5,
// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
// and |r|<2^{-11}
// Th is a table that stores 2^fh (32 entries) rounded to
// double extended precision (only mantissa is stored)
// Tl is a table that stores 2^fl (32 entries) rounded to
// double extended precision (only mantissa is stored)
//
// 2^x is approximated as
// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2)
// Note: We use the following trick to speed up conversion from FP to integer:
//
// Let x = K + r, where K is an integer, and |r| <= 0.5
// Let N be the number of significand bits for the FP format used
// ( N=64 for double-extended, N=53 for double)
//
// Then let y = 1.5 * 2^(N-1) + x for RN mode
// K = y - 1.5 * 2^(N-1)
// r = x - K
//
// If we want to obtain the integer part and the first m fractional bits of x,
// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m):
//
// Let x = K + f + r
// f = 0.b_1 b_2 ... b_m
// |r| <= 2^(-m-1)
//
// Then let y = 1.5 * 2^(N-1-m) + x for RN mode
// (K+f) = y - 1.5 * 2^(N-1-m)
// r = x - (K+f)
// Special values
//==============================================================
// exp2(0)= 1
// exp2(+inf)= inf
// exp2(-inf)= 0
//
// Registers used
//==============================================================
// r2-r3, r14-r40
// f6-f15, f32-f45
// p6-p8, p12
//
// Symbolic register names used by exp2f and its error stub.
//
// General registers: table pointers, thresholds and index arithmetic.
GR_TBL_START = r2
GR_LOG_TBL = r3
GR_OF_LIMIT = r14
GR_UF_LIMIT = r15
GR_EXP_CORR = r16
GR_F_low = r17
GR_F_high = r18
GR_K = r19
GR_Flow_ADDR = r20
GR_BIAS = r21
GR_Fh = r22
GR_Fh_ADDR = r23
GR_EXPMAX = r24
GR_EMIN = r25
GR_ROUNDVAL = r26
GR_MASK = r27
GR_KF0 = r28
GR_MASK_low = r29
GR_COEFF_START = r30
// Registers used by __libm_error_region to preserve b0, ar.pfs and gp
// across the call to __libm_error_support.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
// NOTE(review): GR_SAVE_SP is not referenced by the exp2f code in this file.
GR_SAVE_SP = r36
// Arguments set up for __libm_error_support: three addresses and the tag.
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point values spilled by __libm_error_region.
FR_X = f10
FR_Y = f1
FR_RESULT = f8
// Floating-point working registers of the main computation.
FR_COEFF1 = f6
FR_COEFF2 = f7
FR_R = f9
FR_KF0 = f12
FR_UF_LIMIT = f15
FR_OF_LIMIT = f32
FR_EXPMIN = f33
FR_ROUNDVAL = f34
FR_KF = f35
FR_2_TO_K = f36
FR_T_low = f37
FR_T_high = f38
FR_P12 = f41
FR_T_low_K = f42
FR_T = f44
FR_P = f45
// Data tables
//==============================================================
RODATA
.align 16
// Polynomial coefficients, read front to back by exp2f with a 16-byte
// post-increment per load (ldfe): each is a significand/exponent pair in a
// 16-byte slot.
LOCAL_OBJECT_START(poly_coeffs)
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
LOCAL_OBJECT_END(poly_coeffs)
// Two 32-entry tables of 8-byte significands. exp2f keeps advancing the
// poly_coeffs pointer and uses it as the base of this table, so T_table is
// expected to follow poly_coeffs directly. The low table is indexed with
// f_low*8; the high table is reached at offset 256 and indexed with f_high*8.
LOCAL_OBJECT_START(T_table)
// 2^{0.00000 b6 b7 b8 b9 b10}
data8 0x8000000000000000, 0x8016302f17467628
data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
data8 0x80855ad965e88b83, 0x809ba2264dada76a
data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
data8 0x813801881d886f7b, 0x814e67cceb90502c
data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
data8 0x8272fb97b2a5894c, 0x828998760d01faf3
data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
//
// 2^{0.b1 b2 b3 b4 b5}
data8 0x8000000000000000, 0x82cd8698ac2ba1d7
data8 0x85aac367cc487b14, 0x88980e8092da8527
data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
data8 0x9ef5326091a111ad, 0xa27043030c496818
data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
data8 0xbd08a39f580c36be, 0xc12c4cca66709456
data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
LOCAL_OBJECT_END(T_table)
.section .text
// float exp2f(float x): x arrives in f8 and 2^x is returned in f8.
//
// Main path (non-zero normal/denormal x; zero, infinity and NaN go to
// SPECIAL_exp2):
//   y = x + 1.5*2^(63-10) exposes (K+f)*2^10 in the significand of y
//   r = x - (K+f); f is split into f_high (5 bits) and f_low (5 bits)
//   result = T + T*P with T = 2^{K-126}*T_low*T_high and
//   P = r*(C_1 + C_2*r)
// x >= 2^7 or x < -150 branches to OUT_RANGE_exp2. A result computed for
// x < -126 is reported through __libm_error_region with tag 164.
//
// FR_X is loaded with the caller's x near the top of the routine because
// __libm_error_region stores FR_X as parameter 1 and f8 is overwritten by
// the result before the error path is taken.
WEAK_LIBM_ENTRY(exp2f)
{.mfi
alloc r32= ar.pfs, 1, 4, 4, 0
// will continue only for non-zero normal/denormal numbers
fclass.nm p12, p0= f8, 0x1b
// GR_TBL_START= pointer to C_1...C_2 followed by T_table
addl GR_TBL_START= @ltoff(poly_coeffs), gp
}
{.mlx
mov GR_OF_LIMIT= 0xffff + 7 // Exponent of overflow limit
movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP)
}
;;
// Form special constant 1.5*2^(63-10) to give integer part and first 10
// fractional bits of x
{.mfi
setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant
// p6/p8 are consumed by OUT_RANGE_exp2 to tell underflow from overflow
fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
nop.i 0
}
{.mfb
ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
mov FR_X= f8 // Keep the caller's x for the error handler
(p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero
}
;;
{.mlx
setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
movl GR_UF_LIMIT= 0xc3160000 // (-2^7-22) = -150
}
;;
{.mfi
ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
fma.s0 f8= f8, f1, f0 // normalize x
nop.i 0
}
;;
{.mmi
// after this load GR_COEFF_START has advanced past the two 16-byte
// coefficient slots and is used as the base of T_table
ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
mov GR_EXP_CORR= 0xffff-126 // bias-126, added to K below
}
;;
{.mfi
nop.m 0
fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10)
nop.i 0
}
;;
{.mfi
mov GR_MASK= 1023 // low 10 bits: f_high and f_low
fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)
mov GR_MASK_low= 31 // low 5 bits: f_low
}
;;
{.mfi
getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ?
add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
}
;;
{.mmi
and GR_F_low= GR_KF0, GR_MASK_low // f_low
and GR_F_high= GR_MASK, GR_KF0 // f_high*32 + f_low
shr GR_K= GR_KF0, 10 // K
}
;;
{.mmi
shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
add GR_BIAS= GR_K, GR_EXP_CORR // K + bias-2*63
shr GR_Fh= GR_F_high, 5 // f_high
}
;;
{.mfi
setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f)
shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
}
{.mlx
ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
movl GR_EMIN= 0xc2fc0000 // EMIN= -126
}
;;
{.mfi
ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
// only evaluated when the overflow test above did not fire
(p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
nop.i 0
}
;;
{.mfb
setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN
fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
(p12) br.cond.spnt OUT_RANGE_exp2
}
;;
{.mfi
nop.m 0
fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_P= FR_R, FR_P12, f0 // P= P12*r
nop.i 0
}
;;
{.mfi
nop.m 0
fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
nop.i 0
}
;;
{.mfi
nop.m 0
// from here p6/p8 mean "x < EMIN" rather than "x < 0"
fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ?
nop.i 0
}
;;
{.mfb
nop.m 0
fma.s.s0 f8= FR_P, FR_T, FR_T // result= T+T*P
(p8) br.ret.sptk b0 // return
}
;;
// Here only when x < EMIN: report the underflow with tag 164.
{.mfb
(p6) mov GR_Parameter_TAG= 164
nop.f 0
(p6) br.cond.sptk __libm_error_region
}
;;
// Special-operand path of exp2f: reached when the entry fclass test
// rejected x (branch comment at the call site: nan, inf, zero). p6, p7 and
// p8 are recomputed here and each case returns directly to the caller.
SPECIAL_exp2:
{.mfi
nop.m 0
fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
nop.i 0
}
;;
{.mfi
nop.m 0
fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
nop.i 0
}
;;
{.mfi
nop.m 0
fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
nop.i 0
}
{.mfb
nop.m 0
(p6) mov f8= f0 // exp2(-Infinity)= 0
(p6) br.ret.spnt b0
}
;;
{.mfb
nop.m 0
// f8 already holds +Infinity, so it is returned unchanged
nop.f 0
(p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity
}
;;
{.mfb
nop.m 0
(p8) mov f8= f1 // exp2(+/-0)= 1
(p8) br.ret.spnt b0
}
;;
{.mfb
nop.m 0
// x*1+0 rounded to single under status field s0
// NOTE(review): presumably to quiet a signaling NaN and raise invalid -
// confirm
fma.s.s0 f8= f8, f1, f0 // Remaining cases: NaNs
br.ret.sptk b0
}
;;
// Out-of-range path of exp2f: x is at or above the overflow threshold or
// below the underflow threshold. p6/p8 still hold the "x < 0" test made at
// function entry, so exactly one of the two predicated sequences runs.
// Each squares an exponent-only value under s0 to produce the
// overflowed/underflowed single and sets the matching error tag.
//
// There is no branch at the end: execution continues into the code that
// follows WEAK_LIBM_END(exp2f), which in this file is __libm_error_region.
// NOTE(review): this relies on the END/ENTRY macros and the symbol
// directives emitting no instructions in between - confirm.
OUT_RANGE_exp2:
// overflow: p8= 1
{.mii
(p8) mov GR_EXPMAX= 0x1fffe
nop.i 0
nop.i 0
}
;;
{.mmb
(p8) mov GR_Parameter_TAG= 163
(p8) setf.exp FR_R= GR_EXPMAX
nop.b 999
}
;;
{.mfi
nop.m 999
(p8) fma.s.s0 f8= FR_R, FR_R, f0 // Create overflow
nop.i 999
}
// underflow: p6= 1
{.mii
(p6) mov GR_Parameter_TAG= 164
// GR_EXPMAX is reused here for the smallest exponent
(p6) mov GR_EXPMAX= 1
nop.i 0
}
;;
{.mmb
nop.m 0
(p6) setf.exp FR_R= GR_EXPMAX
nop.b 999
}
;;
{.mfb
nop.m 999
(p6) fma.s.s0 f8= FR_R, FR_R, f0 // Create underflow
nop.b 0
}
;;
WEAK_LIBM_END(exp2f)
// Alias the implementation under the other float names.
libm_alias_float_other (__exp2, exp2)
// Shared builds: exp2f is the default symbol at GLIBC_2.27, and a weak
// __exp2f_compat alias of __exp2f is exported as exp2f at the older
// GLIBC_2.2 version.
#ifdef SHARED
.symver exp2f,exp2f@@GLIBC_2.27
.weak __exp2f_compat
.set __exp2f_compat,__exp2f
.symver __exp2f_compat,exp2f@GLIBC_2.2
#endif
// Error-handling stub for exp2f. Expects GR_Parameter_TAG to be set and
// FR_X, FR_Y (f1), FR_RESULT (f8) to hold the values to report. It builds a
// 64-byte frame, spills the three values as singles, calls
// __libm_error_support and returns to the original caller with f8 reloaded
// from the result slot.
//
// Frame layout after "add sp= -64, sp" (offsets from the new sp):
//   sp+16 : FR_X      (GR_Parameter_X)
//   sp+32 : FR_Y      (GR_Parameter_Y as finally passed)
//   sp+48 : FR_RESULT (GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{.mfi
// old sp - 32, i.e. new sp + 32 once the frame is allocated
add GR_Parameter_Y= -32, sp // Parameter 2 value
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
}
{.mfi
.fframe 64
add sp= -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP= gp // Save gp
}
;;
{.mmi
// post-increment leaves GR_Parameter_Y at sp+48
stfs [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
add GR_Parameter_X= 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0= b0 // Save b0
}
;;
.body
{.mib
stfs [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
// result slot = sp+48
add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{.mib
// the store uses the old GR_Parameter_Y (sp+48); the add in this bundle
// then moves it back to the FR_Y slot at sp+32
stfs [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y= -16, GR_Parameter_Y
br.call.sptk b0= __libm_error_support# // Call error handling function
}
;;
{.mmi
// recompute the result address after the call instead of relying on the
// register surviving it
add GR_Parameter_RESULT= 48, sp
nop.m 0
nop.i 0
}
;;
{.mmi
ldfs f8= [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp= 64, sp // Restore stack pointer
mov b0= GR_SAVE_B0 // Restore return address
}
;;
{.mib
mov gp= GR_SAVE_GP // Restore gp
mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
}
;;
LOCAL_LIBM_END(__libm_error_region)
// External error-support routine called above.
.type __libm_error_support#, @function
.global __libm_error_support#

View File

@ -1,807 +0,0 @@
.file "exp2l.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 07/27/00 Initial version
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [ the previously overwritten ] GR_Parameter_RESULT.
// 02/02/01 Added libm_error_support calls for underflow
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 05/07/03 Reformatted assembly source
//
// API
//==============================================================
// long double exp2l(long double)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x= K + f + r, where
// K is an integer, f= 0.b1 b2... b8 (f>= 0),
// and |r|<2^{-8}
// T is a table that stores 2^f (256 entries) rounded to
// double extended precision (only mantissa is stored)
// D stores (2^f/T [ f ] - 1), rounded to single precision
//
// 2^x is approximated as
// 2^K * T [ f ] * (1+D [ f ] +c1*r+c2*r^2+...+c6*r^6)
//
// Special values
//==============================================================
// exp2(0)= 1
// exp2(+inf)= inf
// exp2(-inf)= 0
//
// Registers used
//==============================================================
// f6-f15, f32-f46
// r2-r3, r8-r11, r14-r40
// p6, p7, p8, p12
// ---- Floating-point register aliases -------------------------------
// Values handed to __libm_error_support through the stack frame built
// in __libm_error_region.
// NOTE(review): FR_X and FR_COEFF3 below both name f10.  Nothing in the
// visible exp2l code copies the input into FR_X before the error region
// stores it as parameter 1, while f10 is loaded with C_3 early on --
// confirm whether the error handler relies on the stored x.
FR_X                = f10
FR_Y                = f1
FR_RESULT           = f8
// Polynomial coefficients C_1..C_6 (loaded from poly_coeffs)
FR_COEFF1           = f14
FR_COEFF2           = f15
FR_COEFF3           = f10
FR_COEFF4           = f11
FR_COEFF5           = f12
FR_COEFF6           = f13
// Argument reduction, table values and range-check thresholds
FR_KF0              = f6
FR_EXP63            = f7
FR_T                = f9
FR_2P14             = f32
FR_UF_TEST          = f33
FR_D                = f34
FR_R                = f35
FR_2EXP             = f36
FR_EMIN             = f37
// Polynomial evaluation temporaries
FR_P34              = f38
FR_P56              = f39
FR_R2               = f40
FR_P12              = f41
FR_TS               = f42
FR_P36              = f43
FR_P02              = f44
FR_R3               = f45
FR_P06              = f46
// ---- General register aliases --------------------------------------
// Table addressing: each *_ADDR0 / *_ADDR pair shares one register (the
// linkage-table slot address is overwritten by the pointer loaded from it).
GR_ADDR0            = r2
GR_ADDR             = r2
GR_D_ADDR0          = r3
GR_D_ADDR           = r3
// Scratch (static) registers
GR_LEADBITS         = r8
GR_256              = r9
GR_EM63             = r10
GR_255              = r11
GR_EXPON            = r14
GR_BM63             = r15
GR_UF_TEST          = r16
GR_INDEX            = r17
GR_K                = r18
// GR_KF and GR_2P14 share r19
GR_KF               = r19
GR_2P14             = r19
GR_EMIN             = r20
GR_IT               = r21
GR_ID               = r22
GR_63               = r23
GR_CONST1           = r24
GR_EBIAS            = r25
GR_CONST2           = r26
GR_CONST3           = r27
GR_SIGNIF           = r28
GR_ARGEXP           = r29
GR_SGN              = r30
GR_EMIN1            = r31
// Stacked registers: r32..r36 are the input/local part and r37..r40 the
// output part of the frame created by "alloc GR_SREG = ar.pfs, 1, 4, 4, 0".
// GR_SAVE_SP is defined but not referenced in the visible exp2l code.
GR_SREG             = r32
GR_SAVE_B0          = r33
GR_SAVE_PFS         = r34
GR_SAVE_GP          = r35
GR_SAVE_SP          = r36
GR_Parameter_X      = r37
GR_Parameter_Y      = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG    = r40
// Data tables
//==============================================================
// poly_coeffs: coefficients of the degree-6 polynomial in r used by exp2l.
// Stored in the order the code consumes them with a post-incrementing
// pointer: C_3/C_4 and C_5/C_6 as two pairs of 8-byte values (read with
// ldfpd), then C_1 and C_2 as 16-byte entries (read with ldfe, step 16).
// The object is 64 bytes long and T_table must follow it directly: after
// the four loads the same pointer is used as the T_table base.
RODATA
.align 16
LOCAL_OBJECT_START(poly_coeffs)
data8 0x3fac6b08d704a0c0 // C_3
data8 0x3f83b2ab6fba4e77 // C_4
data8 0x3f55d87fe78a6731 // C_5
data8 0x3f2430912f86c787 // C_6
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
LOCAL_OBJECT_END(poly_coeffs)
// T_table: 256 eight-byte entries, indexed by the 8-bit fraction index
// (entry address = base + 8*index, loaded with ldf8).  Per the file header
// each entry is the significand of 2^f, f = index/256, rounded to double
// extended precision; only the mantissa is stored.
// Layout dependency: this table must immediately follow poly_coeffs.
LOCAL_OBJECT_START(T_table)
data8 0x8000000000000000, 0x8058d7d2d5e5f6b1
data8 0x80b1ed4fd999ab6c, 0x810b40a1d81406d4
data8 0x8164d1f3bc030773, 0x81bea1708dde6056
data8 0x8218af4373fc25ec, 0x8272fb97b2a5894c
data8 0x82cd8698ac2ba1d7, 0x83285071e0fc4547
data8 0x8383594eefb6ee37, 0x83dea15b9541b132
data8 0x843a28c3acde4046, 0x8495efb3303efd30
data8 0x84f1f656379c1a29, 0x854e3cd8f9c8c95d
data8 0x85aac367cc487b15, 0x86078a2f23642a9f
data8 0x8664915b923fba04, 0x86c1d919caef5c88
data8 0x871f61969e8d1010, 0x877d2afefd4e256c
data8 0x87db357ff698d792, 0x88398146b919f1d4
data8 0x88980e8092da8527, 0x88f6dd5af155ac6b
data8 0x8955ee03618e5fdd, 0x89b540a7902557a4
data8 0x8a14d575496efd9a, 0x8a74ac9a79896e47
data8 0x8ad4c6452c728924, 0x8b3522a38e1e1032
data8 0x8b95c1e3ea8bd6e7, 0x8bf6a434adde0085
data8 0x8c57c9c4646f4dde, 0x8cb932c1bae97a95
data8 0x8d1adf5b7e5ba9e6, 0x8d7ccfc09c50e2f8
data8 0x8ddf042022e69cd6, 0x8e417ca940e35a01
data8 0x8ea4398b45cd53c0, 0x8f073af5a2013520
data8 0x8f6a8117e6c8e5c4, 0x8fce0c21c6726481
data8 0x9031dc431466b1dc, 0x9095f1abc540ca6b
data8 0x90fa4c8beee4b12b, 0x915eed13c89689d3
data8 0x91c3d373ab11c336, 0x9228ffdc10a051ad
data8 0x928e727d9531f9ac, 0x92f42b88f673aa7c
data8 0x935a2b2f13e6e92c, 0x93c071a0eef94bc1
data8 0x9426ff0fab1c04b6, 0x948dd3ac8ddb7ed3
data8 0x94f4efa8fef70961, 0x955c5336887894d5
data8 0x95c3fe86d6cc7fef, 0x962bf1cbb8d97560
data8 0x96942d3720185a00, 0x96fcb0fb20ac4ba3
data8 0x97657d49f17ab08e, 0x97ce9255ec4357ab
data8 0x9837f0518db8a96f, 0x98a1976f7597e996
data8 0x990b87e266c189aa, 0x9975c1dd47518c77
data8 0x99e0459320b7fa65, 0x9a4b13371fd166ca
data8 0x9ab62afc94ff864a, 0x9b218d16f441d63d
data8 0x9b8d39b9d54e5539, 0x9bf93118f3aa4cc1
data8 0x9c6573682ec32c2d, 0x9cd200db8a0774cb
data8 0x9d3ed9a72cffb751, 0x9dabfdff6367a2aa
data8 0x9e196e189d472420, 0x9e872a276f0b98ff
data8 0x9ef5326091a111ae, 0x9f6386f8e28ba651
data8 0x9fd228256400dd06, 0xa041161b3d0121be
data8 0xa0b0510fb9714fc2, 0xa11fd9384a344cf7
data8 0xa18faeca8544b6e4, 0xa1ffd1fc25cea188
data8 0xa27043030c496819, 0xa2e102153e918f9e
data8 0xa3520f68e802bb93, 0xa3c36b345991b47c
data8 0xa43515ae09e6809e, 0xa4a70f0c95768ec5
data8 0xa5195786be9ef339, 0xa58bef536dbeb6ee
data8 0xa5fed6a9b15138ea, 0xa6720dc0be08a20c
data8 0xa6e594cfeee86b1e, 0xa7596c0ec55ff55b
data8 0xa7cd93b4e965356a, 0xa8420bfa298f70d1
data8 0xa8b6d5167b320e09, 0xa92bef41fa77771b
data8 0xa9a15ab4ea7c0ef8, 0xaa1717a7b5693979
data8 0xaa8d2652ec907629, 0xab0386ef48868de1
data8 0xab7a39b5a93ed337, 0xabf13edf162675e9
data8 0xac6896a4be3fe929, 0xace0413ff83e5d04
data8 0xad583eea42a14ac6, 0xadd08fdd43d01491
data8 0xae493452ca35b80e, 0xaec22c84cc5c9465
data8 0xaf3b78ad690a4375, 0xafb51906e75b8661
data8 0xb02f0dcbb6e04584, 0xb0a957366fb7a3c9
data8 0xb123f581d2ac2590, 0xb19ee8e8c94feb09
data8 0xb21a31a66618fe3b, 0xb295cff5e47db4a4
data8 0xb311c412a9112489, 0xb38e0e38419fae18
data8 0xb40aaea2654b9841, 0xb487a58cf4a9c180
data8 0xb504f333f9de6484, 0xb58297d3a8b9f0d2
data8 0xb60093a85ed5f76c, 0xb67ee6eea3b22b8f
data8 0xb6fd91e328d17791, 0xb77c94c2c9d725e9
data8 0xb7fbefca8ca41e7c, 0xb87ba337a1743834
data8 0xb8fbaf4762fb9ee9, 0xb97c143756844dbf
data8 0xb9fcd2452c0b9deb, 0xba7de9aebe5fea09
data8 0xbaff5ab2133e45fb, 0xbb81258d5b704b6f
data8 0xbc034a7ef2e9fb0d, 0xbc85c9c560e7b269
data8 0xbd08a39f580c36bf, 0xbd8bd84bb67ed483
data8 0xbe0f6809860993e2, 0xbe935317fc378238
data8 0xbf1799b67a731083, 0xbf9c3c248e2486f8
data8 0xc0213aa1f0d08db0, 0xc0a6956e8836ca8d
data8 0xc12c4cca66709456, 0xc1b260f5ca0fbb33
data8 0xc238d2311e3d6673, 0xc2bfa0bcfad907c9
data8 0xc346ccda24976407, 0xc3ce56c98d21b15d
data8 0xc4563ecc5334cb33, 0xc4de8523c2c07baa
data8 0xc5672a115506dadd, 0xc5f02dd6b0bbc3d9
data8 0xc67990b5aa245f79, 0xc70352f04336c51e
data8 0xc78d74c8abb9b15d, 0xc817f681416452b2
data8 0xc8a2d85c8ffe2c45, 0xc92e1a9d517f0ecc
data8 0xc9b9bd866e2f27a3, 0xca45c15afcc72624
data8 0xcad2265e4290774e, 0xcb5eecd3b38597c9
data8 0xcbec14fef2727c5d, 0xcc799f23d11510e5
data8 0xcd078b86503dcdd2, 0xcd95da6a9ff06445
data8 0xce248c151f8480e4, 0xceb3a0ca5dc6a55d
data8 0xcf4318cf191918c1, 0xcfd2f4683f94eeb5
data8 0xd06333daef2b2595, 0xd0f3d76c75c5db8d
data8 0xd184df6251699ac6, 0xd2164c023056bcab
data8 0xd2a81d91f12ae45a, 0xd33a5457a3029054
data8 0xd3ccf099859ac379, 0xd45ff29e0972c561
data8 0xd4f35aabcfedfa1f, 0xd5872909ab75d18a
data8 0xd61b5dfe9f9bce07, 0xd6aff9d1e13ba2fe
data8 0xd744fccad69d6af4, 0xd7da67311797f56a
data8 0xd870394c6db32c84, 0xd9067364d44a929c
data8 0xd99d15c278afd7b6, 0xda3420adba4d8704
data8 0xdacb946f2ac9cc72, 0xdb63714f8e295255
data8 0xdbfbb797daf23755, 0xdc9467913a4f1c92
data8 0xdd2d818508324c20, 0xddc705bcd378f7f0
data8 0xde60f4825e0e9124, 0xdefb4e1f9d1037f2
data8 0xdf9612deb8f04420, 0xe031430a0d99e627
data8 0xe0ccdeec2a94e111, 0xe168e6cfd3295d23
data8 0xe2055afffe83d369, 0xe2a23bc7d7d91226
data8 0xe33f8972be8a5a51, 0xe3dd444c46499619
data8 0xe47b6ca0373da88d, 0xe51a02ba8e26d681
data8 0xe5b906e77c8348a8, 0xe658797368b3a717
data8 0xe6f85aaaee1fce22, 0xe798aadadd5b9cbf
data8 0xe8396a503c4bdc68, 0xe8da9958464b42ab
data8 0xe97c38406c4f8c57, 0xea1e4756550eb27b
data8 0xeac0c6e7dd24392f, 0xeb63b74317369840
data8 0xec0718b64c1cbddc, 0xecaaeb8ffb03ab41
data8 0xed4f301ed9942b84, 0xedf3e6b1d418a491
data8 0xee990f980da3025b, 0xef3eab20e032bc6b
data8 0xefe4b99bdcdaf5cb, 0xf08b3b58cbe8b76a
data8 0xf13230a7ad094509, 0xf1d999d8b7708cc1
data8 0xf281773c59ffb13a, 0xf329c9233b6bae9c
data8 0xf3d28fde3a641a5b, 0xf47bcbbe6db9fddf
data8 0xf5257d152486cc2c, 0xf5cfa433e6537290
data8 0xf67a416c733f846e, 0xf7255510c4288239
data8 0xf7d0df730ad13bb9, 0xf87ce0e5b2094d9c
data8 0xf92959bb5dd4ba74, 0xf9d64a46eb939f35
data8 0xfa83b2db722a033a, 0xfb3193cc4227c3f4
data8 0xfbdfed6ce5f09c49, 0xfc8ec01121e447bb
data8 0xfd3e0c0cf486c175, 0xfdedd1b496a89f35
data8 0xfe9e115c7b8f884c, 0xff4ecb59511ec8a5
LOCAL_OBJECT_END(T_table)
// D_table: 256 four-byte entries, indexed by the same 8-bit fraction index
// as T_table (entry address = base + 4*index, loaded with ldfs).  Per the
// file header each entry is the correction (2^f / T[f] - 1) rounded to
// single precision, i.e. the rounding error left by the T_table entry.
LOCAL_OBJECT_START(D_table)
data4 0x00000000, 0x9f55c08f, 0x1e93ffa3, 0x1dcd43a8
data4 0x1f751f79, 0x9f3cdd88, 0x9f43d155, 0x1eda222c
data4 0x1ef35513, 0x9f597895, 0x9e698881, 0x1ec71073
data4 0x1e50e371, 0x9dc01e19, 0x1de74133, 0x1e2f028c
data4 0x9edefb47, 0x1ebbac48, 0x9e8b0330, 0x9e9e9314
data4 0x1edc1d11, 0x1f098529, 0x9f52827c, 0x1f50050d
data4 0x1f301e8e, 0x1f5b64d1, 0x9f45e3ee, 0x9ef64d6d
data4 0x1d6ec5e8, 0x9e61ad9a, 0x1d44ccbb, 0x9e4a8bbb
data4 0x9cf11576, 0x9dcce7e7, 0x9d02ac90, 0x1f26ccf0
data4 0x9f0877c6, 0x9ddd62ae, 0x9f4b7fc3, 0x1ea8ef6b
data4 0x1ea4378d, 0x1ef6fc38, 0x1db99fd9, 0x1f22bf6f
data4 0x1f53e172, 0x1e85504a, 0x9f37cc75, 0x1f0c5e17
data4 0x1dde8aac, 0x9cb42bb2, 0x1e153cd7, 0x1eb62bba
data4 0x9e9b941b, 0x9ea80e3c, 0x1f508823, 0x1ec3fd36
data4 0x1e9ffaa1, 0x1e21e2eb, 0x9d948b1d, 0x9e8ac93a
data4 0x1ef7ee6f, 0x9e80dda3, 0x1f0814be, 0x1dc5ddfe
data4 0x1eedb9d1, 0x9f2aaa26, 0x9ea5b0fc, 0x1edf702e
data4 0x9e391201, 0x1f1316bb, 0x1ea27fb7, 0x9e05ed18
data4 0x9f199ed2, 0x1ee7fd7c, 0x1f003db6, 0x9eac3793
data4 0x9e5b8c10, 0x9f3af17c, 0x1bc9a8be, 0x1ee3c004
data4 0x9f19b1b2, 0x9f242ce9, 0x9ce67dd1, 0x9e4f6275
data4 0x1e20742c, 0x1eb9328a, 0x9f477153, 0x1d969718
data4 0x9f1e6c43, 0x1f2f67f4, 0x9f39c7e4, 0x9e3c4feb
data4 0x1da3956b, 0x9e7c685d, 0x1f280911, 0x9f0d8afb
data4 0x1e314b40, 0x9eb4f250, 0x9f1a34ad, 0x1ef5d5e7
data4 0x9f145496, 0x1e604827, 0x9f1e5195, 0x1e9c1fc0
data4 0x1efde521, 0x1e69b385, 0x1f316830, 0x9f244eae
data4 0x1f1787ec, 0x9e939971, 0x1f0bb393, 0x9f0511d6
data4 0x1ed919de, 0x1d8b7b28, 0x1e5ca4a9, 0x1e7c357b
data4 0x9e3ff8e8, 0x1eef53b5, 0x9ed22ed7, 0x1f16659b
data4 0x9f2db102, 0x9e2c6a78, 0x1f328d7d, 0x9f2fec3c
data4 0x1eb395bd, 0x9f242b84, 0x9e2683e6, 0x1ed71e68
data4 0x1efd1df5, 0x9e9eeafd, 0x9ed2249c, 0x1eef129a
data4 0x1d1ea44c, 0x9e81f7ff, 0x1eaf77c9, 0x9ee7a285
data4 0x1e1864ed, 0x9ee7edbb, 0x9e15a27d, 0x9ae61655
data4 0x1f1ff1a2, 0x1da29755, 0x9e5f46fb, 0x1e901236
data4 0x9eecfb9b, 0x9f204d2f, 0x1ec64685, 0x9eb809bd
data4 0x9e0026c5, 0x1d9f1da1, 0x1f142b49, 0x9f20f22e
data4 0x1f24b067, 0x1f185a4c, 0x9f09765c, 0x9ece902f
data4 0x1e2ca5db, 0x1e6de464, 0x9f071f67, 0x1f1518c3
data4 0x1ea13ded, 0x1f0b8414, 0x1edb6ad4, 0x9e548740
data4 0x9ea10efb, 0x1ee48a60, 0x1e7954c5, 0x9edad013
data4 0x9f21517d, 0x9e9b6e0c, 0x9ee7f9a6, 0x9ebd4298
data4 0x9d65b24e, 0x1eed751f, 0x9f1573ea, 0x9d430377
data4 0x9e13fc0c, 0x1e47008a, 0x1e3d5c1d, 0x1ef41a91
data4 0x9e4a4ef7, 0x9e952f18, 0x1d620566, 0x1d9b8d33
data4 0x1db06247, 0x1e94b31e, 0x1f0730ad, 0x9d79ffb4
data4 0x1ed64d51, 0x9e91fd11, 0x9e28d35a, 0x9dea0ed9
data4 0x1e891def, 0x9ee28ac0, 0x1e1db99b, 0x9ee1ce38
data4 0x9bdd9bca, 0x1eb72cb9, 0x9e8c53c6, 0x1e0df6ca
data4 0x1e8f2ccd, 0x9e9b0886, 0x1eeb3bc7, 0x1ec7e772
data4 0x9e210776, 0x9daf246c, 0x1ea1f151, 0x1ece4dc6
data4 0x1ce741c8, 0x1ed3c88f, 0x9ec9a4fd, 0x9e0c8d30
data4 0x1d2fbb26, 0x9ef212a7, 0x1ee44f1c, 0x9e445550
data4 0x1e075f77, 0x9d9291a3, 0x1f09c2ee, 0x9e012c88
data4 0x1f057d62, 0x9e7bb0dc, 0x9d8758ee, 0x1ee8d6c1
data4 0x9e509a57, 0x9e4ca7b7, 0x1e2cb341, 0x9ec35106
data4 0x1ecf3baf, 0x1e11781c, 0x1ea0cc78, 0x1eb75ca6
data4 0x1e961e1a, 0x1eb88853, 0x1e7abf50, 0x1ee38704
data4 0x9dc5ab0f, 0x1afe197b, 0x9ec07523, 0x9d9b7f78
data4 0x1f011618, 0x1ed43b0b, 0x9f035945, 0x9e3fd014
data4 0x9bbda5cd, 0x9e83f8ab, 0x1e58a928, 0x1e392d61
data4 0x1efdbb52, 0x1ee310a8, 0x9ec7ecc1, 0x1e8c9ed6
data4 0x9ef82dee, 0x9e70545b, 0x9ea53fc4, 0x1e40f419
LOCAL_OBJECT_END(D_table)
// exp2l: computes 2^x for the long double argument in f8, result in f8.
// Main path: split |x| into integer part K, 8-bit fraction index f and
// remainder r, then form  2^(+/-K) * T[f] * (1 + D[f] + c1*r + ... + c6*r^6).
// Side paths:
//   SPECIAL_exp2l    - x is not a normal/denormal number (zero, Inf, NaN, ...)
//   OUT_RANGE_exp2l  - |x| is too large; builds an overflow/underflow result
//                      and sets the error tag (159 = overflow, 160 = underflow)
// Predicates p6 (x negative) and p8 (x non-negative) are set once near the
// top and are still relied upon in OUT_RANGE_exp2l.
// NOTE(review): the error region stores FR_X (f10) as the "x" parameter, but
// f10 is also FR_COEFF3 and is loaded with C_3 below; no instruction visible
// here copies the input into f10 -- confirm against __libm_error_support.
.section .text
GLOBAL_LIBM_ENTRY(exp2l)
{.mii
// get sign and biased exponent field of x
getf.exp GR_EBIAS = f8
// GR_D_ADDR0 = linkage-table slot for D_table (dereferenced by ld8 below)
addl GR_D_ADDR0 = @ltoff(D_table), gp
// GR_ADDR0 = linkage-table slot for C_1...C_6 followed by T_table
addl GR_ADDR0 = @ltoff(poly_coeffs), gp ;;
}
{.mfi
// get significand
getf.sig GR_SIGNIF = f8
// will continue only for normal/denormal numbers (p12 = 1 otherwise)
fclass.nm.unc p12, p7 = f8, 0x1b
mov GR_63 = 63 ;;
}
{.mfi
nop.m 0
nop.f 0
// GR_CONST2 = bias+63-8
mov GR_CONST2 = 0xffff+55
}
{.mfi
// GR_CONST1 = bias+15
mov GR_CONST1 = 0xffff+15
nop.f 0
// GR_CONST3 = mask for the 17-bit exponent field
mov GR_CONST3 = 0x1ffff ;;
}
{.mfi
// load start address for C_1...C_6 followed by T_table
ld8 GR_ADDR = [ GR_ADDR0 ]
nop.f 0
// get sign of argument (bits of GR_EBIAS above the exponent field)
andcm GR_SGN = GR_EBIAS, GR_CONST3
}
{.mfi
// GR_D_ADDR = pointer to D_table
ld8 GR_D_ADDR = [ GR_D_ADDR0 ]
nop.f 0
// get argument exponent (biased)
and GR_ARGEXP = GR_CONST3, GR_EBIAS ;;
}
{.mfi
// register frame: 1 input + 4 locals (r32-r36), 4 outputs (r37-r40)
alloc GR_SREG = ar.pfs, 1, 4, 4, 0
nop.f 0
// p6 = 1 if sign = 1 (x negative), p8 = 1 otherwise
cmp.ne p6, p8 = GR_SGN, r0
}
{.mfi
// p7 = 1 if unbiased exponent >= 15 (argument out of range)
cmp.ge p7, p0 = GR_ARGEXP, GR_CONST1
nop.f 0
// shift count that leaves the integer part plus 8 fraction bits
sub GR_EXPON = GR_CONST2, GR_ARGEXP ;;
}
{.mib
// load C_3, C_4
ldfpd FR_COEFF3, FR_COEFF4 = [ GR_ADDR ], 16
// get first exponent+8 bits
shr.u GR_LEADBITS = GR_SIGNIF, GR_EXPON
(p12) br.cond.spnt SPECIAL_exp2l
}
{.mib
mov GR_256 = 256
// GR_EM63 = exponent field - 63
sub GR_EM63 = GR_EBIAS, GR_63
(p7) br.cond.spnt OUT_RANGE_exp2l ;;
}
{.mlx
// load C_5, C_6
ldfpd FR_COEFF5, FR_COEFF6 = [ GR_ADDR ], 16
// GR_2P14 = 2^14 (single-precision bit pattern)
movl GR_2P14 = 0x46800000 ;;
}
{.mfi
// load C_1
ldfe FR_COEFF1 = [ GR_ADDR ], 16
// normalize input
fma.s0 f8 = f8, f1, f0
// GR_BM63 = bias-63
mov GR_BM63 = 0xffff-63 ;;
}
{.mlx
setf.s FR_2P14 = GR_2P14
// GR_UF_TEST = -2^14-62 (single-precision bit pattern)
movl GR_UF_TEST = 0xc6807c00
}
{.mfi
// load C_2; afterwards GR_ADDR points at T_table
ldfe FR_COEFF2 = [ GR_ADDR ], 16
nop.f 0
mov GR_255 = 255 ;;
}
{.mib
// get 8-bit index
and GR_INDEX = GR_255, GR_LEADBITS
// get K = integer part
shr.u GR_K = GR_LEADBITS, 8
nop.b 0 ;;
}
{.mmi
// if sign = 1 && f>0, set p7 = 1 (p7 is cleared when sign = 0)
(p6) cmp.gt.unc p7, p0 = GR_INDEX, r0
setf.s FR_UF_TEST = GR_UF_TEST
// GR_KF = leading bits moved back to their significand position
shl GR_KF = GR_LEADBITS, GR_EXPON ;;
}
{.mfi
// if sign = 1 && f>0, set index = 256-index (f = 1-f)
(p7) sub GR_INDEX = GR_256, GR_INDEX
nop.f 0
// if sign = 1 && f>0, set K = K+1
(p7) add GR_K = GR_K, r0, 1 ;;
}
{.mfi
// FR_EXP63 = 2^{expon-63}
setf.exp FR_EXP63 = GR_EM63
nop.f 0
nop.i 0 ;;
}
.pred.rel "mutex", p6, p8
{.mfi
// if sign = 0, set scale factor exponent S = K+bias-63
(p8) add GR_K = GR_K, GR_BM63
nop.f 0
// if sign = 1, set scale factor exponent S = -K+bias-63
(p6) sub GR_K = GR_BM63, GR_K ;;
}
{.mmi
// FR_KF0 = 2^{63-expon}*(K+f)
setf.sig FR_KF0 = GR_KF
nop.m 0
// GR_EMIN = 0xC67FF800 >> 11; the shl by 11 below rebuilds the
// single-precision bit pattern of EMIN = 2-2^14
mov GR_EMIN = 0x18cfff ;;
}
{.mfi
// get T_table entry address (8-byte entries)
shladd GR_IT = GR_INDEX, 3, GR_ADDR
// p7 = 1 if x >= 2^14, p12 = 1 otherwise
fcmp.ge.s1 p7, p12 = f8, FR_2P14
// get D_table entry address (4-byte entries)
shladd GR_ID = GR_INDEX, 2, GR_D_ADDR ;;
}
{.mfi
// load T_table value
ldf8 FR_T = [ GR_IT ]
// p7 = 1 if x < -2^14-62
(p12) fcmp.lt.s1 p7, p0 = f8, FR_UF_TEST
// GR_EMIN1 = single-precision bit pattern of EMIN = 2-2^14
shl GR_EMIN1 = GR_EMIN, 11 ;;
}
{.mmb
// FR_2EXP = scale factor = 2^{S-bias}, S computed above from K
setf.exp FR_2EXP = GR_K
// load D_table value
ldfs FR_D = [ GR_ID ]
(p7) br.cond.spnt OUT_RANGE_exp2l ;;
}
{.mfi
nop.m 0
// get r = x-(K+f)
fnma.s1 FR_R = FR_KF0, FR_EXP63, f8
nop.i 0 ;;
}
{.mfi
// FR_EMIN = EMIN
setf.s FR_EMIN = GR_EMIN1
// P34 = C_4*r+C_3
fma.s1 FR_P34 = FR_COEFF4, FR_R, FR_COEFF3
nop.i 0
}
{.mfi
nop.m 0
// P56 = C_6*r+C_5
fma.s1 FR_P56 = FR_COEFF6, FR_R, FR_COEFF5
nop.i 0 ;;
}
{.mfi
nop.m 0
// r2 = r*r
fma.s1 FR_R2 = FR_R, FR_R, f0
nop.i 0
}
{.mfi
nop.m 0
// P12 = C_2*r+C_1
fma.s1 FR_P12 = FR_COEFF2, FR_R, FR_COEFF1
nop.i 0 ;;
}
{.mfi
nop.m 0
// TS = T * scale factor
fma.s1 FR_TS = FR_T, FR_2EXP, f0
nop.i 0
}
{.mfi
nop.m 0
// P36 = P34+r2*P56
fma.s1 FR_P36 = FR_P56, FR_R2, FR_P34
nop.i 0 ;;
}
{.mfi
nop.m 0
// P02 = D+r*P12
fma.s1 FR_P02 = FR_P12, FR_R, FR_D
nop.i 0
}
{.mfi
nop.m 0
// r3 = r*r2
fma.s1 FR_R3 = FR_R2, FR_R, f0
nop.i 0 ;;
}
{.mfi
nop.m 0
// P06 = P02+r3*P36
fma.s1 FR_P06 = FR_P36, FR_R3, FR_P02
nop.i 0 ;;
}
{.mfi
nop.m 0
// underflow (x<EMIN) ?  p6 = 1 if so, p8 = 1 otherwise
fcmp.lt.s0 p6, p8 = f8, FR_EMIN
nop.i 0 ;;
}
{.mfb
nop.m 0
// result = TS+TS*P06
fma.s0 f8 = FR_TS, FR_P06, FR_TS
// return unless underflow was detected
(p8) br.ret.sptk b0
}
{.mfb
// underflow: error tag 160, then report through the error region
(p6) mov GR_Parameter_TAG = 160
nop.f 0
(p6) br.cond.sptk __libm_error_region ;;
}
// Reached when x is not a normal/denormal number
SPECIAL_exp2l:
{.mfi
nop.m 0
// x = -Infinity ?
fclass.m p6, p0 = f8, 0x22
nop.i 0 ;;
}
{.mfi
nop.m 0
// x = +Infinity ?
fclass.m p7, p0 = f8, 0x21
nop.i 0 ;;
}
{.mfi
nop.m 0
// x = +/-Zero ?
fclass.m p8, p0 = f8, 0x7
nop.i 0
}
{.mfb
nop.m 0
// exp2l(-Infinity) = 0
(p6) mov f8 = f0
(p6) br.ret.spnt b0 ;;
}
{.mfb
nop.m 0
// exp2l(+Infinity) = +Infinity (f8 returned unchanged)
nop.f 0
(p7) br.ret.spnt b0 ;;
}
{.mfb
nop.m 0
// exp2l(+/-0) = 1
(p8) mov f8 = f1
(p8) br.ret.spnt b0 ;;
}
{.mfb
nop.m 0
// Remaining cases: NaNs
fma.s0 f8 = f8, f1, f0
br.ret.sptk b0 ;;
}
// Reached when |x| is too large; p6/p8 still hold the sign of x
OUT_RANGE_exp2l:
{.mfi
// overflow: p8 = 1
(p8) mov GR_EM63 = 0x1fffe
// normalize input, to detect pseudo-zeroes
fma.s0 f8 = f8, f1, f0
nop.i 0 ;;
}
{.mfi
nop.m 0
// f8 = 0?
fcmp.eq.s1 p7, p0 = f8, f0
nop.i 0 ;;
}
{.mmb
// overflow: error tag 159, FR_TS = value with exponent field 0x1fffe
(p8) mov GR_Parameter_TAG = 159
(p8) setf.exp FR_TS = GR_EM63
nop.b 999 ;;
}
{.mfb
nop.m 0
// pseudo-zero: return 1
(p7) mov f8 = f1
(p7) br.ret.sptk b0 ;;
}
{.mfi
nop.m 999
// overflow: result = FR_TS*FR_TS
(p8) fma.s0 f8 = FR_TS, FR_TS, f0
nop.i 999
}
{.mii
nop.m 0
// underflow: p6 = 1
(p6) mov GR_EM63 = 1
nop.i 0 ;;
}
{.mmb
// underflow: error tag 160, FR_TS = value with exponent field 1
(p6) mov GR_Parameter_TAG = 160
(p6) setf.exp FR_TS = GR_EM63
nop.b 999 ;;
}
{.mfb
nop.m 999
// underflow: result = FR_TS*FR_TS
(p6) fma.s0 f8 = FR_TS, FR_TS, f0
// no branch here: control appears to fall through into
// __libm_error_region, which follows immediately -- assuming the
// END/ENTRY macros emit no instructions (TODO confirm)
nop.b 0 ;;
}
GLOBAL_LIBM_END(exp2l)
libm_alias_ldouble_other (exp2, exp2)
// __libm_error_region (exp2l): common tail that reports an overflow or
// underflow through __libm_error_support.  On entry GR_Parameter_TAG holds
// the error tag and FR_RESULT (f8) the provisional result.
// A 64-byte stack frame is created and three 16-byte slots are filled:
//   new sp + 16 : FR_X       (GR_Parameter_X)
//   new sp + 32 : FR_Y       (GR_Parameter_Y)
//   new sp + 48 : FR_RESULT  (GR_Parameter_RESULT)
// After the call the value at new sp + 48 is reloaded into f8 and returned.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{.mfi
add GR_Parameter_Y = -32, sp // Parameter 2 address (= new sp + 32)
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{.mfi
.fframe 64
add sp = -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP = gp ;; // Save gp
}
{.mmi
stfe [ GR_Parameter_Y ] = FR_Y, 16 // STORE Parameter 2 on stack; pointer advances to new sp + 48
add GR_Parameter_X = 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0 = b0 ;; // Save b0
}
.body
{.mib
stfe [ GR_Parameter_X ] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address (new sp + 48)
nop.b 0
}
{.mib
stfe [ GR_Parameter_Y ] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16, GR_Parameter_Y // Point Parameter 2 back at new sp + 32
br.call.sptk b0 = __libm_error_support# ;; // Call error handling function
}
{.mmi
add GR_Parameter_RESULT = 48, sp // Recompute the result slot address after the call
nop.m 0
nop.i 0 ;;
}
{.mmi
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp = 64, sp // Restore stack pointer
mov b0 = GR_SAVE_B0 ;; // Restore return address
}
{.mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 ;; // Return
}
LOCAL_LIBM_END(__libm_error_region)
// External error handler called above
.type __libm_error_support#, @function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,722 +0,0 @@
.file "expf.s"
// Copyright (c) 2000 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//*********************************************************************
// 02/02/00 Original version
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 08/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case
// 12/07/00 Widen main path, shorten x=inf, nan paths
// 03/15/01 Fix monotonicity problem around x=0 for round to +inf
// 02/05/02 Corrected uninitialize predicate in POSSIBLE_UNDERFLOW path
// 05/20/02 Cleaned up namespace and sf0 syntax
// 07/26/02 Algorithm changed, accuracy improved
// 09/26/02 support of higher precision inputs added, underflow threshold
// corrected
// 11/15/02 Improved performance on Itanium 2, added possible over/under paths
// 05/30/03 Set inexact flag on unmasked overflow/underflow
// 03/31/05 Reformatted delimiters between data tables
//
//
// API
//*********************************************************************
// float expf(float)
//
// Overview of operation
//*********************************************************************
// Take the input x. w is "how many log2/64 in x?"
// w = x * 64/log2
// NJ = int(w)
// x = NJ*log2/64 + R
// NJ = 64*n + j
// x = n*log2 + (log2/64)*j + R
//
// So, exp(x) = 2^n * 2^(j/64)* exp(R)
//
// T = 2^n * 2^(j/64)
// Construct 2^n
// Get 2^(j/64) table
// actually all the entries of 2^(j/64) table are stored in DP and
// with exponent bits set to 0 -> multiplication on 2^n can be
// performed by doing logical "or" operation with bits presenting 2^n
// exp(R) = 1 + (exp(R) - 1)
// P = exp(R) - 1 approximated by Taylor series of 3rd degree
// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
//
// The final result is reconstructed as follows
// exp(x) = T + T*P
// Special values
//*********************************************************************
// expf(+0) = 1.0
// expf(-0) = 1.0
// expf(+qnan) = +qnan
// expf(-qnan) = -qnan
// expf(+snan) = +qnan
// expf(-snan) = -qnan
// expf(-inf) = +0
// expf(+inf) = +inf
// Overflow and Underflow
//*********************************************************************
// expf(x) = largest single normal when
// x = 88.72283 = 0x42b17217
// expf(x) = smallest single normal when
// x = -87.33654 = 0xc2aeac4f
// expf(x) = largest round-to-nearest single zero when
// x = -103.97208 = 0xc2cff1b5
// Registers used
//*********************************************************************
// Floating Point registers used:
// f8, input
// f6,f7, f9 -> f15, f32 -> f40
// General registers used:
// r3, r23 -> r38
// Predicate registers used:
// p10 -> p15
// Assembly macros
//*********************************************************************
// ---- General registers ---------------------------------------------
// Scratch.  Two registers carry two names each, for values whose uses
// do not overlap: r23 (rTmp / rJ) and r29 (r17ones_m1 / rGt_ln).
rNJ                 = r3
rTmp                = r23
rJ                  = r23
rN                  = r24
rTblAddr            = r25
rA3                 = r26
rExpHalf            = r27
rLn2Div64           = r28
r17ones_m1          = r29
rGt_ln              = r29
rRightShifter       = r30
r64DivLn2           = r31
// Stacked: saved state and the arguments for the error handler
GR_SAVE_PFS         = r32
GR_SAVE_B0          = r33
GR_SAVE_GP          = r34
GR_Parameter_X      = r35
GR_Parameter_Y      = r36
GR_Parameter_RESULT = r37
GR_Parameter_TAG    = r38
// ---- Floating-point registers --------------------------------------
// Values passed to the error handler.  FR_X shares f10 with fNint; the
// error paths copy the argument into FR_X with fmerge.s before branching.
FR_X                = f10
FR_Y                = f1
FR_RESULT           = f8
// Scratch
fRightShifter       = f6
f64DivLn2           = f7
fNormX              = f9
fNint               = f10
fN                  = f11
fR                  = f12
fLn2Div64           = f13
fA2                 = f14
fA3                 = f15
// Stacked.  f39 (fTmp / fGt_pln) and f40 (fWre_urm_f8 / fFtz_urm_f8)
// each carry two names.
fP                  = f32
fT                  = f33
fMIN_SGL_OFLOW_ARG  = f34
fMAX_SGL_ZERO_ARG   = f35
fMAX_SGL_NORM_ARG   = f36
fMIN_SGL_NORM_ARG   = f37
fRSqr               = f38
fTmp                = f39
fGt_pln             = f39
fWre_urm_f8         = f40
fFtz_urm_f8         = f40
// _expf_table: constants for expf.
//   bytes 0..15 : four single-precision range thresholds, read as two pairs
//                 by ldfps with a post-increment of 8 each
//   bytes 16..  : 64 eight-byte entries for 2^(j/64), j = 0..63, addressed as
//                 (table pointer after the threshold loads) + 8*j
// Per the file header, the 2^(j/64) entries are double-precision values
// stored with their exponent bits set to 0, so the bits of 2^n can be
// merged in with a logical "or".
RODATA
.align 16
LOCAL_OBJECT_START(_expf_table)
data4 0x42b17218 // Smallest sgl arg to overflow sgl result, +88.7228
data4 0xc2cff1b5 // Largest sgl for rnd-to-nearest 0 result, -103.9720
data4 0x42b17217 // Largest sgl arg to give normal sgl result, +88.7228
data4 0xc2aeac4f // Smallest sgl arg to give normal sgl result, -87.3365
//
// 2^(j/64) table, j goes from 0 to 63
data8 0x0000000000000000 // 2^(0/64)
data8 0x00002C9A3E778061 // 2^(1/64)
data8 0x000059B0D3158574 // 2^(2/64)
data8 0x0000874518759BC8 // 2^(3/64)
data8 0x0000B5586CF9890F // 2^(4/64)
data8 0x0000E3EC32D3D1A2 // 2^(5/64)
data8 0x00011301D0125B51 // 2^(6/64)
data8 0x0001429AAEA92DE0 // 2^(7/64)
data8 0x000172B83C7D517B // 2^(8/64)
data8 0x0001A35BEB6FCB75 // 2^(9/64)
data8 0x0001D4873168B9AA // 2^(10/64)
data8 0x0002063B88628CD6 // 2^(11/64)
data8 0x0002387A6E756238 // 2^(12/64)
data8 0x00026B4565E27CDD // 2^(13/64)
data8 0x00029E9DF51FDEE1 // 2^(14/64)
data8 0x0002D285A6E4030B // 2^(15/64)
data8 0x000306FE0A31B715 // 2^(16/64)
data8 0x00033C08B26416FF // 2^(17/64)
data8 0x000371A7373AA9CB // 2^(18/64)
data8 0x0003A7DB34E59FF7 // 2^(19/64)
data8 0x0003DEA64C123422 // 2^(20/64)
data8 0x0004160A21F72E2A // 2^(21/64)
data8 0x00044E086061892D // 2^(22/64)
data8 0x000486A2B5C13CD0 // 2^(23/64)
data8 0x0004BFDAD5362A27 // 2^(24/64)
data8 0x0004F9B2769D2CA7 // 2^(25/64)
data8 0x0005342B569D4F82 // 2^(26/64)
data8 0x00056F4736B527DA // 2^(27/64)
data8 0x0005AB07DD485429 // 2^(28/64)
data8 0x0005E76F15AD2148 // 2^(29/64)
data8 0x0006247EB03A5585 // 2^(30/64)
data8 0x0006623882552225 // 2^(31/64)
data8 0x0006A09E667F3BCD // 2^(32/64)
data8 0x0006DFB23C651A2F // 2^(33/64)
data8 0x00071F75E8EC5F74 // 2^(34/64)
data8 0x00075FEB564267C9 // 2^(35/64)
data8 0x0007A11473EB0187 // 2^(36/64)
data8 0x0007E2F336CF4E62 // 2^(37/64)
data8 0x00082589994CCE13 // 2^(38/64)
data8 0x000868D99B4492ED // 2^(39/64)
data8 0x0008ACE5422AA0DB // 2^(40/64)
data8 0x0008F1AE99157736 // 2^(41/64)
data8 0x00093737B0CDC5E5 // 2^(42/64)
data8 0x00097D829FDE4E50 // 2^(43/64)
data8 0x0009C49182A3F090 // 2^(44/64)
data8 0x000A0C667B5DE565 // 2^(45/64)
data8 0x000A5503B23E255D // 2^(46/64)
data8 0x000A9E6B5579FDBF // 2^(47/64)
data8 0x000AE89F995AD3AD // 2^(48/64)
data8 0x000B33A2B84F15FB // 2^(49/64)
data8 0x000B7F76F2FB5E47 // 2^(50/64)
data8 0x000BCC1E904BC1D2 // 2^(51/64)
data8 0x000C199BDD85529C // 2^(52/64)
data8 0x000C67F12E57D14B // 2^(53/64)
data8 0x000CB720DCEF9069 // 2^(54/64)
data8 0x000D072D4A07897C // 2^(55/64)
data8 0x000D5818DCFBA487 // 2^(56/64)
data8 0x000DA9E603DB3285 // 2^(57/64)
data8 0x000DFC97337B9B5F // 2^(58/64)
data8 0x000E502EE78B3FF6 // 2^(59/64)
data8 0x000EA4AFA2A490DA // 2^(60/64)
data8 0x000EFA1BEE615A27 // 2^(61/64)
data8 0x000F50765B6E4540 // 2^(62/64)
data8 0x000FA7C1819E90D8 // 2^(63/64)
LOCAL_OBJECT_END(_expf_table)
.section .text
GLOBAL_IEEE754_ENTRY(expf)
{ .mlx
addl rTblAddr = @ltoff(_expf_table),gp
movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
}
{ .mlx
addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
}
;;
{ .mfi
// point to the beginning of the table
ld8 rTblAddr = [rTblAddr]
fclass.m p14, p0 = f8, 0x22 // test for -INF
shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
}
{ .mfi
nop.m 0
fnorm.s1 fNormX = f8 // normalized x
addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
{ .mfi
setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
fclass.m p15, p0 = f8, 0x1e1 // test for NaT,NaN,+Inf
nop.i 0
}
{ .mlx
// load Right Shifter to FP reg
setf.d fRightShifter = rRightShifter
movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
}
;;
{ .mfi
nop.m 0
fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
nop.i 0
}
{ .mfb
setf.s fA3 = rA3 // load A3 to FP reg
(p14) fma.s.s0 f8 = f0, f1, f0 // result if x = -inf
(p14) br.ret.spnt b0 // exit here if x = -inf
}
;;
{ .mfi
setf.exp fA2 = rExpHalf // load A2 to FP reg
fcmp.eq.s0 p6, p0 = f8, f0 // Dummy to flag denorm
nop.i 0
}
{ .mfb
setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,+Inf
(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,+Inf
}
;;
{ .mfb
// overflow and underflow_zero threshold
ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_ZERO_ARG = [rTblAddr], 8
(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0
(p13) br.ret.spnt b0 // exit here if x =0.0
}
;;
// max normal and underflow_denorm threshold
{ .mfi
ldfps fMAX_SGL_NORM_ARG, fMIN_SGL_NORM_ARG = [rTblAddr], 8
nop.f 0
nop.i 0
}
;;
{ .mfi
nop.m 0
// x*(64/ln(2)) + Right Shifter
fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
nop.i 0
}
;;
// Divide arguments into the following categories:
// Certain Underflow p11 - -inf < x <= MAX_SGL_ZERO_ARG
// Possible Underflow p13 - MAX_SGL_ZERO_ARG < x < MIN_SGL_NORM_ARG
// Certain Safe - MIN_SGL_NORM_ARG <= x <= MAX_SGL_NORM_ARG
// Possible Overflow p14 - MAX_SGL_NORM_ARG < x < MIN_SGL_OFLOW_ARG
// Certain Overflow p15 - MIN_SGL_OFLOW_ARG <= x < +inf
//
// If the input is really a single arg, then there will never be
// "Possible Overflow" arguments.
//
{ .mfi
nop.m 0
// check for overflow
fcmp.ge.s1 p15, p0 = fNormX, fMIN_SGL_OFLOW_ARG
nop.i 0
}
;;
{ .mfi
nop.m 0
// check for underflow and tiny (+0) result
fcmp.le.s1 p11, p0 = fNormX, fMAX_SGL_ZERO_ARG
nop.i 0
}
{ .mfb
nop.m 0
fms.s1 fN = fNint, f1, fRightShifter // n in FP register
// branch out if overflow
(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW
}
;;
{ .mfb
getf.sig rNJ = fNint // bits of n, j
// check for underflow and denormal result
fcmp.lt.s1 p13, p0 = fNormX, fMIN_SGL_NORM_ARG
// branch out if underflow and tiny (+0) result
(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW
}
;;
{ .mfi
nop.m 0
// check for possible overflow
fcmp.gt.s1 p14, p0 = fNormX, fMAX_SGL_NORM_ARG
extr.u rJ = rNJ, 0, 6 // bits of j
}
{ .mfi
addl rN = 0xFFFF - 63, rNJ // biased and shifted n
fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
nop.i 0
}
;;
{ .mfi
shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
nop.f 0
shr rN = rN, 6 // biased n
}
;;
{ .mfi
ld8 rJ = [rJ]
nop.f 0
shl rN = rN, 52 // 2^n bits in DP format
}
;;
{ .mfi
or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format
nop.f 0
nop.i 0
}
;;
{ .mfi
setf.d fT = rN // 2^n * 2^(j/64)
fma.s1 fP = fA3, fR, fA2 // A3*R + A2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fRSqr = fR, fR, f0 // R^2
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
nop.i 0
}
;;
{ .mbb
nop.m 0
// branch out if possible underflow
(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW
// branch out if possible overflow result
(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW
}
;;
{ .mfb
nop.m 0
// final result in the absence of over- and underflow
fma.s.s0 f8 = fP, fT, fT
// exit here in the absence of over- and underflow
br.ret.sptk b0
}
;;
EXP_POSSIBLE_OVERFLOW:
// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG
// This cannot happen if input is a single, only if input higher precision.
// Overflow is a possibility, not a certainty.
// Recompute result using status field 2 with user's rounding mode,
// and wre set. If result is larger than largest single, then we have
// overflow
{ .mfi
mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
nop.i 0
}
;;
{ .mfi
setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set
nop.i 0
}
;;
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
nop.i 0
}
;;
{ .mfb
nop.m 0
nop.f 0
(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow
}
;;
{ .mfb
nop.m 0
fma.s.s0 f8 = fP, fT, fT
br.ret.sptk b0 // Exit if really no overflow
}
;;
// here if overflow
EXP_CERTAIN_OVERFLOW:
{ .mmi
addl r17ones_m1 = 0x1FFFE, r0
;;
setf.exp fTmp = r17ones_m1
nop.i 0
}
;;
{ .mfi
alloc r32=ar.pfs,0,3,4,0
fmerge.s FR_X = f8,f8
nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 16
fma.s.s0 FR_RESULT = fTmp, fTmp, fTmp // Set I,O and +INF result
br.cond.sptk __libm_error_region
}
;;
EXP_POSSIBLE_UNDERFLOW:
// Here if fMAX_SGL_ZERO_ARG < x < fMIN_SGL_NORM_ARG
// Underflow is a possibility, not a certainty
// We define an underflow when the answer with
// ftz set
// is zero (tiny numbers become zero)
// Notice (from below) that if we have an unlimited exponent range,
// then there is an extra machine number E between the largest denormal and
// the smallest normal.
// So if with unbounded exponent we round to E or below, then we are
// tiny and underflow has occurred.
// But notice that you can be in a situation where we are tiny, namely
// rounded to E, but when the exponent is bounded we round to smallest
// normal. So the answer can be the smallest normal with underflow.
// E
// -----+--------------------+--------------------+-----
// | | |
// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe
// 0.1...11 2^-3ffe (biased, 1)
// largest dn smallest normal
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s.s2 fFtz_urm_f8 = fP, fT, fT // Result with ftz set
nop.i 0
}
;;
{ .mfi
nop.m 0
fsetc.s2 0x7F,0x40 // Turn off ftz in sf2
nop.i 0
}
;;
{ .mfi
nop.m 0
fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow
nop.i 0
}
{ .mfi
nop.m 0
fma.s.s0 f8 = fP, fT, fT // Compute result, set I, maybe U
nop.i 0
}
;;
{ .mbb
nop.m 0
(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow
(p7) br.ret.sptk b0 // Exit if really no underflow
}
;;
EXP_CERTAIN_UNDERFLOW:
// Here if x < fMAX_SGL_ZERO_ARG
// Result will be zero (or smallest denorm if round to +inf) with I, U set
{ .mmi
mov rTmp = 1
;;
setf.exp fTmp = rTmp // Form small normal
nop.i 0
}
;;
{ .mfi
nop.m 0
fmerge.se fTmp = fTmp, f64DivLn2 // Small with non-trial signif
nop.i 0
}
;;
{ .mfb
nop.m 0
fma.s.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
br.cond.sptk EXP_UNDERFLOW_COMMON
}
;;
EXP_UNDERFLOW_COMMON:
// Determine if underflow result is zero or nonzero
{ .mfi
alloc r32=ar.pfs,0,3,4,0
fcmp.eq.s1 p6, p0 = f8, f0
nop.i 0
}
;;
{ .mfb
nop.m 0
fmerge.s FR_X = fNormX,fNormX
(p6) br.cond.spnt EXP_UNDERFLOW_ZERO
}
;;
EXP_UNDERFLOW_NONZERO:
// Here if x < fMIN_SGL_NORM_ARG and result nonzero;
// I, U are set
{ .mfb
mov GR_Parameter_TAG = 17
nop.f 0 // FR_RESULT already set
br.cond.sptk __libm_error_region
}
;;
EXP_UNDERFLOW_ZERO:
// Here if x < fMIN_SGL_NORM_ARG and result zero;
// I, U are set
{ .mfb
mov GR_Parameter_TAG = 17
nop.f 0 // FR_RESULT already set
br.cond.sptk __libm_error_region
}
;;
GLOBAL_IEEE754_END(expf)
libm_alias_float_other (__exp, exp)
#ifdef SHARED
.symver expf,expf@@GLIBC_2.27
.weak __expf_compat
.set __expf_compat,__expf
.symver __expf_compat,expf@GLIBC_2.2
#endif
// __libm_error_region (expf): shared error tail reached by a plain branch
// from the overflow/underflow paths above, with GR_Parameter_TAG already
// set by the branching code.  It builds a 64-byte frame, spills FR_Y,
// FR_X and FR_RESULT into it, passes their addresses in GR_Parameter_Y,
// GR_Parameter_X and GR_Parameter_RESULT to __libm_error_support, then
// reloads f8 from the result slot and returns through the saved b0.
// Frame layout relative to the new sp: +16 = X, +32 = Y, +48 = result.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// Reads sp before the next bundle (same instruction group) lowers it,
// so this is old sp-32, i.e. new sp+32: the address of the Y slot.
add GR_Parameter_Y=-32,sp // Parameter 2 slot address
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y pointing at new sp+48,
// which becomes the result slot below.
stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mfi
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
nop.f 0
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
// Step GR_Parameter_Y back to the Y slot (new sp+32) for the callee.
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
// Recompute the result-slot address after the call rather than relying
// on the register value passed to the callee.
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// __libm_error_support is defined outside this file.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,555 +0,0 @@
.file "fmod.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
// 02/02/00 Initial version
// 03/02/00 New Algorithm
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 11/28/00 Set FR_Y to f9
// 03/11/02 Fixed flags for fmod(qnan,zero)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno
//
// API
//====================================================================
// double fmod(double,double);
//
// Overview of operation
//====================================================================
// fmod(a,b)=a-i*b,
// where i is an integer such that, if b!=0,
// |i|<=|a/b| and |a/b-i|<1
//
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2,y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
// round quotient estimate to single precision (k=RN(q2)),
// calculate partial remainder (a'=a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
//
// Special cases
//====================================================================
// b=+/-0: return NaN, call libm_error_support
// a=+/-Inf, a=NaN or b=NaN: return NaN
//
// Registers used
//====================================================================
// Predicate registers: p6-p11
// General registers: r2, r32 (ar.pfs), r33-r40
// Floating point registers: f6-f15
// Register aliases for fmod and its __libm_error_region tail.
// GR_SAVE_* hold b0, ar.pfs and gp across the call to
// __libm_error_support.  GR_SAVE_SP is defined but not referenced
// anywhere in this file.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
// Arguments handed to __libm_error_support: addresses of the spilled
// x, y and result, plus the error tag.
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point values spilled by the error tail.  FR_X is f10 because
// f8 holds the result by then; the zero-divisor path copies x into f10.
FR_X = f10
FR_Y = f9
FR_RESULT = f8
.section .text
// double fmod(double a, double b)
// In:  f8 = a, f9 = b.   Out: f8 = result.
// Flow:
//   1. Form |a| (f6) and |b| (f7), start the reciprocal with frcpa and
//      classify the operands.
//   2. Branch away for a = NaN/Inf (FMOD_X_NAN_INF), b = NaN/Inf/0
//      (FMOD_Y_NAN_INF_ZERO), or return a directly when |a| < |b|.
//   3. Refine quotient (f13) and reciprocal (f10) estimates, steps (1)-(8).
//   4. loop53 repeatedly subtracts a rounded multiple of |b| from |a| until
//      the quotient estimate drops below 2^32, then finishes with a
//      truncated integer quotient and returns.
// The b = 0 case ends in __libm_error_region with GR_Parameter_TAG = 121.
GLOBAL_IEEE754_ENTRY(fmod)
// inputs in f8, f9
// result in f8
{ .mfi
alloc r32=ar.pfs,1,4,4,0
// f6=|a|
fmerge.s f6=f0,f8
// biased exponent for the 2^-34 constant used in step (5)
mov r2 = 0x0ffdd
}
{.mfi
nop.m 0
// f7=|b|
fmerge.s f7=f0,f9
nop.i 0;;
}
{ .mfi
setf.exp f11 = r2
// (1) y0
frcpa.s1 f10,p6=f6,f7
nop.i 0
}
// Y +-NAN, +-inf, +-0? p7
{ .mfi
nop.m 999
fclass.m.unc p7,p0 = f9, 0xe7
nop.i 999;;
}
// qnan snan inf norm unorm 0 -+
// 1 1 1 0 0 0 11
// e 3
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
fclass.m.unc p9,p0 = f8, 0xe3
nop.i 999
}
// |x| < |y|? Return x p8
{ .mfi
nop.m 999
fcmp.lt.unc.s1 p8,p0 = f6,f7
nop.i 999 ;;
}
{ .mfi
nop.m 0
// normalize y (if |x|<|y|)
(p8) fma.s0 f9=f9,f1,f0
nop.i 0;;
}
{ .mfi
// biased exponent for 2^32 (loaded into f15 below)
mov r2=0x1001f
// (2) q0=a*y0
(p6) fma.s1 f13=f6,f10,f0
nop.i 0
}
{ .mfi
nop.m 0
// (3) e0 = 1 - b * y0
(p6) fnma.s1 f12=f7,f10,f1
nop.i 0;;
}
{.mfi
nop.m 0
// normalize x (if |x|<|y|)
(p8) fma.d.s0 f8=f8,f1,f0
nop.i 0
}
// Dispatch special operands before entering the main algorithm; the
// |x|<|y| early return uses the f8 written just above.
{.bbb
(p9) br.cond.spnt FMOD_X_NAN_INF
(p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
// if |x|<|y|, return
(p8) br.ret.spnt b0;;
}
{.mfi
nop.m 0
// normalize x
fma.s0 f6=f6,f1,f0
nop.i 0
}
{.mfi
nop.m 0
// normalize y
fma.s0 f7=f7,f1,f0
nop.i 0;;
}
{.mfi
// f15=2^32
setf.exp f15=r2
// (4) q1=q0+e0*q0
(p6) fma.s1 f13=f12,f13,f13
nop.i 0
}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 f14=f12,f12,f11
nop.i 0;;
}
{.mlx
nop.m 0
// single-precision bit pattern of 1.25*2^-24 (moved into f12 below)
movl r2=0x33a00000;;
}
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 f10=f12,f10,f10
nop.i 0;;
}
{.mfi
// set f12=1.25*2^{-24}
setf.s f12=r2
// (7) q2=q1+e1*q1
(p6) fma.s1 f13=f13,f14,f13
nop.i 0;;
}
{.mfi
nop.m 0
// f9 = |b| with the sign of a (sign from f8, magnitude from f9)
fmerge.s f9=f8,f9
nop.i 0
}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 f10=f14,f10,f10
// set p6=0, p10=0
cmp.ne.and p6,p10=r0,r0;;
}
// Main reduction loop.  Per iteration:
//   p8 = quotient estimate f13 < 2^32 (this is the final iteration)
//   p7 = otherwise (reduce |a| and go round again)
// f6 = current |a| (partial remainder), f7 = |b|, f10 = y2 ~ 1/|b|.
.align 32
loop53:
{.mfi
nop.m 0
// compare q2, 2^32
fcmp.lt.unc.s1 p8,p7=f13,f15
nop.i 0
}
{.mfi
nop.m 0
// will truncate quotient to integer, if exponent<32 (in advance)
fcvt.fx.trunc.s1 f11=f13
nop.i 0;;
}
{.mfi
nop.m 0
// if exponent>32, round quotient to single precision (perform in advance)
fma.s.s1 f13=f13,f1,f0
nop.i 0;;
}
{.mfi
nop.m 0
// set f12=sgn(a)
(p8) fmerge.s f12=f8,f1
nop.i 0
}
{.mfi
nop.m 0
// normalize truncated quotient
(p8) fcvt.xf f13=f11
nop.i 0;;
}
{ .mfi
nop.m 0
// calculate remainder (assuming f13=RZ(Q))
(p7) fnma.s1 f14=f13,f7,f6
nop.i 0
}
{.mfi
nop.m 0
// also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
(p7) fnma.s.s1 f11=f13,f12,f13
nop.i 0;;
}
{.mfi
nop.m 0
// (p8) calculate remainder (82-bit format)
(p8) fnma.s1 f11=f13,f7,f6
nop.i 0
}
{.mfi
nop.m 0
// (p7) calculate remainder (assuming f11=RZ(Q))
(p7) fnma.s1 f6=f11,f7,f6
nop.i 0;;
}
{.mfi
nop.m 0
// Final iteration (p8): is f6 the correct remainder (quotient was not overestimated) ?
// p6 = remainder f11 is negative (must add |b| back), p10 = it is not
(p8) fcmp.lt.unc.s1 p6,p10=f11,f0
nop.i 0;;
}
{.mfi
nop.m 0
// get new quotient estimation: a'*y2
(p7) fma.s1 f13=f14,f10,f0
nop.i 0
}
{.mfb
nop.m 0
// was f14=RZ(Q) ? (then new remainder f14>=0)
// from here on p7 = f14 negative, p9 = f14 non-negative
(p7) fcmp.lt.unc.s1 p7,p9=f14,f0
nop.b 0;;
}
.pred.rel "mutex",p6,p10
{.mfb
nop.m 0
// add b to estimated remainder (to cover the case when the quotient was overestimated)
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
(p6) fma.d.s0 f8=f11,f12,f9
nop.b 0
}
{.mfb
nop.m 0
// round remainder to double precision (fma.d)
// set correct sign of result before returning
(p10) fma.d.s0 f8=f11,f12,f0
(p8) br.ret.sptk b0;;
}
{.mfi
nop.m 0
// if f13!=RZ(Q), get alternative quotient estimation: a''*y2
(p7) fma.s1 f13=f6,f10,f0
nop.i 0
}
{.mfb
nop.m 0
// if f14 was RZ(Q), set remainder to f14
(p9) mov f6=f14
br.cond.sptk loop53;;
}
// ---- a is NaN or Inf (p9 from the entry classification) ----
FMOD_X_NAN_INF:
// Y zero ?
{.mfi
nop.m 0
fclass.m p10,p0=f8,0xc3 // Test x=nan
nop.i 0
}
{.mfi
nop.m 0
// f10 = y run through fma.s1, compared against zero below (p11)
fma.s1 f10=f9,f1,f0
nop.i 0;;
}
{.mfi
nop.m 0
fma.s0 f8=f8,f1,f0
nop.i 0
}
{.mfi
nop.m 0
(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
nop.i 0;;
}
{.mfb
nop.m 0
fcmp.eq.unc.s1 p11,p0=f10,f0
(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
}
{.mib
nop.m 0
nop.i 0
// if Y zero
(p11) br.cond.spnt FMOD_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
fclass.m.unc p8,p9 = f8, 0x23
nop.i 999;;
}
// Y NaN ?
{.mfi
nop.m 999
(p8) fclass.m p9,p8=f9,0xc3
nop.i 0;;
}
{.mfi
nop.m 999
// x = Inf and y not NaN: f8/f8 on an infinity yields the NaN result
(p8) frcpa.s0 f8,p0 = f8,f8
nop.i 0
}
{ .mfi
nop.m 999
// also set Denormal flag if necessary
(p8) fma.s0 f9=f9,f1,f0
nop.i 999 ;;
}
{ .mfb
nop.m 999
(p8) fma.d.s0 f8=f8,f1,f0
nop.b 999 ;;
}
{ .mfb
nop.m 999
// remaining case (p9): a NaN operand is involved; frcpa produces the result
(p9) frcpa.s0 f8,p7=f8,f9
br.ret.sptk b0 ;;
}
// ---- b is NaN, Inf or zero (p7 from the entry classification) ----
FMOD_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
// y = Inf: result is x, rounded to double
(p7) fma.d.s0 f8=f8,f1,f0
(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
// y = NaN: result is y, rounded to double
(p9) fma.d.s0 f8=f9,f1,f0
(p9) br.ret.spnt b0 ;;
}
// Falls through into FMOD_Y_ZERO when y is neither Inf nor NaN.
FMOD_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
{.mfi
nop.m 0
// set Invalid
frcpa.s0 f12,p0=f0,f0
nop.i 0
}
// X NAN?
{ .mfi
nop.m 999
fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
// x matching no class in mask 0xff is handled like NaN (p9)
(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{.mfi
nop.m 999
(p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
(p10) frcpa.s0 f11,p7 = f9,f9
// error tag passed to __libm_error_support for fmod with zero divisor
mov GR_Parameter_TAG = 121 ;;
}
{ .mfi
nop.m 999
// keep the original x in f10 (FR_X) before f8 is overwritten
fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfb
nop.m 999
fma.d.s0 f8=f11,f1,f0
br.sptk __libm_error_region;;
}
GLOBAL_IEEE754_END(fmod)
libm_alias_double_other (__fmod, fmod)
// __libm_error_region (fmod): error tail reached by a plain branch from
// FMOD_Y_ZERO with GR_Parameter_TAG already set.  It builds a 64-byte
// frame, spills FR_Y, FR_X and FR_RESULT as doubles, passes their
// addresses to __libm_error_support, then reloads f8 from the result slot
// and returns through the saved b0.
// Frame layout relative to the new sp: +16 = X, +32 = Y, +48 = result.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// Reads sp before the next bundle (same instruction group) lowers it,
// so this is old sp-32, i.e. new sp+32: the address of the Y slot.
add GR_Parameter_Y=-32,sp // Parameter 2 slot address
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y at new sp+48 (result slot).
stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
// Step GR_Parameter_Y back to the Y slot (new sp+32) for the callee.
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// Recompute the result-slot address after the call (see the 08/15/00
// history entry: the register is overwritten by the call).
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// __libm_error_support is defined outside this file.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,569 +0,0 @@
.file "fmodf.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
// 02/02/00 Initial version
// 03/02/00 New Algorithm
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 11/28/00 Set FR_Y to f9
// 03/11/02 Fixed flags for fmodf(qnan,zero)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno
//
// API
//====================================================================
// float fmodf(float,float);
//
// Overview of operation
//====================================================================
// fmod(a,b)=a-i*b,
// where i is an integer such that, if b!=0,
// |i|<=|a/b| and |a/b-i|<1
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2,y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
// round quotient estimate to single precision (k=RN(q2)),
// calculate partial remainder (a'=a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
// Special cases
//====================================================================
// b=+/-0: return NaN, call libm_error_support
// a=+/-Inf, a=NaN or b=NaN: return NaN
// Registers used
//====================================================================
// Predicate registers: p6-p11
// General registers: r2, r32 (ar.pfs), r33-r40
// Floating point registers: f6-f15
// Register aliases for fmodf and its __libm_error_region tail.
// GR_SAVE_* hold b0, ar.pfs and gp across the call to
// __libm_error_support.  GR_SAVE_SP is defined but not referenced
// anywhere in this file.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
// Arguments handed to __libm_error_support: addresses of the spilled
// x, y and result, plus the error tag.
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point values spilled by the error tail.  FR_X is f10 because
// f8 holds the result by then; the zero-divisor path copies x into f10.
FR_X = f10
FR_Y = f9
FR_RESULT = f8
.section .text
// float fmodf(float a, float b)
// In:  f8 = a, f9 = b.   Out: f8 = result.
// Flow:
//   1. Form |a| (f6) and |b| (f7), start the reciprocal with frcpa and
//      classify the operands.
//   2. Branch away for a = NaN/Inf (FMOD_X_NAN_INF), b = NaN/Inf/0
//      (FMOD_Y_NAN_INF_ZERO), or return a directly when |a| < |b|.
//   3. Refine quotient (f13) and reciprocal (f10) estimates, steps (1)-(8).
//   4. loop24 repeatedly subtracts a rounded multiple of |b| from |a| until
//      the quotient estimate drops below 2^32, then finishes with a
//      truncated integer quotient and returns.
// The b = 0 case ends in __libm_error_region with GR_Parameter_TAG = 122.
GLOBAL_IEEE754_ENTRY(fmodf)
// inputs in f8, f9
// result in f8
{ .mfi
alloc r32=ar.pfs,1,4,4,0
// f6=|a|
fmerge.s f6=f0,f8
// biased exponent for the 2^-34 constant used in step (5)
mov r2 = 0x0ffdd
}
{.mfi
nop.m 0
// f7=|b|
fmerge.s f7=f0,f9
nop.i 0;;
}
{ .mfi
setf.exp f11 = r2
// (1) y0
frcpa.s1 f10,p6=f6,f7
nop.i 0
}
// eliminate special cases
// Y +-NAN, +-inf, +-0? p7
{ .mfi
nop.m 999
fclass.m.unc p7,p0 = f9, 0xe7
nop.i 999;;
}
// qnan snan inf norm unorm 0 -+
// 1 1 1 0 0 0 11
// e 3
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
fclass.m.unc p9,p0 = f8, 0xe3
nop.i 999
}
// |x| < |y|? Return x p8
{ .mfi
nop.m 999
fcmp.lt.unc.s1 p8,p0 = f6,f7
nop.i 999 ;;
}
{ .mfi
nop.m 0
// normalize y (if |x|<|y|)
(p8) fma.s0 f9=f9,f1,f0
nop.i 0;;
}
{ .mfi
// biased exponent for 2^32 (loaded into f15 below)
mov r2=0x1001f
// (2) q0=a*y0
(p6) fma.s1 f13=f6,f10,f0
nop.i 0
}
{ .mfi
nop.m 0
// (3) e0 = 1 - b * y0
(p6) fnma.s1 f12=f7,f10,f1
nop.i 0;;
}
{.mfi
nop.m 0
// normalize x (if |x|<|y|)
(p8) fma.s.s0 f8=f8,f1,f0
nop.i 0
}
// Dispatch special operands before entering the main algorithm; the
// |x|<|y| early return uses the f8 written just above.
{.bbb
(p9) br.cond.spnt FMOD_X_NAN_INF
(p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
// if |x|<|y|, return
(p8) br.ret.spnt b0;;
}
{.mfi
nop.m 0
// normalize x
fma.s0 f6=f6,f1,f0
nop.i 0
}
{.mfi
nop.m 0
// normalize y
fma.s0 f7=f7,f1,f0
nop.i 0;;
}
{.mfi
// f15=2^32
setf.exp f15=r2
// (4) q1=q0+e0*q0
(p6) fma.s1 f13=f12,f13,f13
nop.i 0
}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 f14=f12,f12,f11
nop.i 0;;
}
{.mlx
nop.m 0
// single-precision bit pattern of 1.25*2^-24 (moved into f12 below)
movl r2=0x33a00000;;
}
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 f10=f12,f10,f10
nop.i 0;;
}
{.mfi
// set f12=1.25*2^{-24}
setf.s f12=r2
// (7) q2=q1+e1*q1
(p6) fma.s1 f13=f13,f14,f13
nop.i 0;;
}
{.mfi
nop.m 0
// f9 = |b| with the sign of a (sign from f8, magnitude from f9)
fmerge.s f9=f8,f9
nop.i 0
}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 f10=f14,f10,f10
// set p6=0, p10=0
cmp.ne.and p6,p10=r0,r0;;
}
// Main reduction loop.  Per iteration:
//   p8 = quotient estimate f13 < 2^32 (this is the final iteration)
//   p7 = otherwise (reduce |a| and go round again)
// f6 = current |a| (partial remainder), f7 = |b|, f10 = y2 ~ 1/|b|.
// The only exit is the (p8) br.ret below; the bundle that closes the loop
// branches back unconditionally.
.align 32
loop24:
{.mfi
nop.m 0
// compare q2, 2^32
fcmp.lt.unc.s1 p8,p7=f13,f15
nop.i 0
}
{.mfi
nop.m 0
// will truncate quotient to integer, if exponent<32 (in advance)
fcvt.fx.trunc.s1 f11=f13
nop.i 0;;
}
{.mfi
nop.m 0
// if exponent>32, round quotient to single precision (perform in advance)
fma.s.s1 f13=f13,f1,f0
nop.i 0;;
}
{.mfi
nop.m 0
// set f12=sgn(a)
(p8) fmerge.s f12=f8,f1
nop.i 0
}
{.mfi
nop.m 0
// normalize truncated quotient
(p8) fcvt.xf f13=f11
nop.i 0;;
}
{ .mfi
nop.m 0
// calculate remainder (assuming f13=RZ(Q))
(p7) fnma.s1 f14=f13,f7,f6
nop.i 0
}
{.mfi
nop.m 0
// also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
(p7) fnma.s.s1 f11=f13,f12,f13
nop.i 0;;
}
{.mfi
nop.m 0
// (p8) calculate remainder (82-bit format)
(p8) fnma.s1 f11=f13,f7,f6
nop.i 0
}
{.mfi
nop.m 0
// (p7) calculate remainder (assuming f11=RZ(Q))
(p7) fnma.s1 f6=f11,f7,f6
nop.i 0;;
}
{.mfi
nop.m 0
// Final iteration (p8): is f6 the correct remainder (quotient was not overestimated) ?
// p6 = remainder f11 is negative (must add |b| back), p10 = it is not
(p8) fcmp.lt.unc.s1 p6,p10=f11,f0
nop.i 0;;
}
{.mfi
nop.m 0
// get new quotient estimation: a'*y2
(p7) fma.s1 f13=f14,f10,f0
nop.i 0
}
{.mfb
nop.m 0
// was f14=RZ(Q) ? (then new remainder f14>=0)
// from here on p7 = f14 negative, p9 = f14 non-negative
(p7) fcmp.lt.unc.s1 p7,p9=f14,f0
nop.b 0;;
}
.pred.rel "mutex",p6,p10
{.mfb
nop.m 0
// add b to estimated remainder (to cover the case when the quotient was overestimated)
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
(p6) fma.s.s0 f8=f11,f12,f9
nop.b 0
}
{.mfb
nop.m 0
// calculate remainder (single precision)
// set correct sign of result before returning
(p10) fma.s.s0 f8=f11,f12,f0
(p8) br.ret.sptk b0;;
}
{.mfi
nop.m 0
// if f13!=RZ(Q), get alternative quotient estimation: a''*y2
(p7) fma.s1 f13=f6,f10,f0
nop.i 0
}
{.mfb
nop.m 0
// if f14 was RZ(Q), set remainder to f14
(p9) mov f6=f14
br.cond.sptk loop24;;
}
// ---- a is NaN or Inf (p9 from the entry classification) ----
FMOD_X_NAN_INF:
// Y zero ?
{.mfi
nop.m 0
fclass.m p10,p0=f8,0xc3 // Test x=nan
nop.i 0
}
{.mfi
nop.m 0
// f10 = y run through fma.s1, compared against zero below (p11)
fma.s1 f10=f9,f1,f0
nop.i 0;;
}
{.mfi
nop.m 0
fma.s0 f8=f8,f1,f0
nop.i 0
}
{.mfi
nop.m 0
(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
nop.i 0;;
}
{.mfb
nop.m 0
fcmp.eq.unc.s1 p11,p0=f10,f0
(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
}
{.mib
nop.m 0
nop.i 0
// if Y zero
(p11) br.cond.spnt FMOD_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
fclass.m.unc p8,p9 = f8, 0x23
nop.i 999;;
}
// Y NaN ?
{.mfi
nop.m 999
(p8) fclass.m p9,p8=f9,0xc3
nop.i 0;;
}
{.mfi
nop.m 999
// x = Inf and y not NaN: f8/f8 on an infinity yields the NaN result
(p8) frcpa.s0 f8,p0 = f8,f8
nop.i 0
}
{ .mfi
nop.m 999
// also set Denormal flag if necessary
(p8) fma.s0 f9=f9,f1,f0
nop.i 999 ;;
}
{ .mfb
nop.m 999
(p8) fma.s.s0 f8=f8,f1,f0
nop.b 999 ;;
}
{ .mfb
nop.m 999
// remaining case (p9): a NaN operand is involved; frcpa produces the result
(p9) frcpa.s0 f8,p7=f8,f9
br.ret.sptk b0 ;;
}
// ---- b is NaN, Inf or zero (p7 from the entry classification) ----
FMOD_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
// y = Inf: result is x, rounded to single
(p7) fma.s.s0 f8=f8,f1,f0
(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
// y = NaN: result is y, rounded to single
(p9) fma.s.s0 f8=f9,f1,f0
(p9) br.ret.spnt b0 ;;
}
// Falls through into FMOD_Y_ZERO when y is neither Inf nor NaN.
FMOD_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
{.mfi
nop.m 0
// set Invalid
frcpa.s0 f12,p0=f0,f0
nop.i 999
}
// X NAN?
{ .mfi
nop.m 999
fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
// x matching no class in mask 0xff is handled like NaN (p9)
(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{.mfi
nop.m 999
(p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
// keep the original x in f10 (FR_X) before f8 is overwritten
fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
fma.s.s0 f8=f11,f1,f0
nop.i 999;;
}
// Label name is historical; nothing in this file branches to it, the
// code above simply falls through.
EXP_ERROR_RETURN:
{ .mib
nop.m 0
// error tag passed to __libm_error_support for fmodf with zero divisor
mov GR_Parameter_TAG=122
br.sptk __libm_error_region;;
}
GLOBAL_IEEE754_END(fmodf)
libm_alias_float_other (__fmod, fmod)
// __libm_error_region (fmodf): error tail reached by a plain branch from
// the zero-divisor path with GR_Parameter_TAG already set.  It builds a
// 64-byte frame, spills FR_Y, FR_X and FR_RESULT as singles, passes their
// addresses to __libm_error_support, then reloads f8 from the result slot
// and returns through the saved b0.
// Frame layout relative to the new sp: +16 = X, +32 = Y, +48 = result.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// Reads sp before the next bundle (same instruction group) lowers it,
// so this is old sp-32, i.e. new sp+32: the address of the Y slot.
add GR_Parameter_Y=-32,sp // Parameter 2 slot address
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y at new sp+48 (result slot).
stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
// Step GR_Parameter_Y back to the Y slot (new sp+32) for the callee.
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support#;; // Call error handling function
}
{ .mmi
nop.m 0
nop.m 0
// Recompute the result-slot address after the call (see the 08/15/00
// history entry: the register is overwritten by the call).
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// __libm_error_support is defined outside this file.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,672 +0,0 @@
.file "fmodl.s"
// Copyright (c) 2000 - 2004, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
// 02/02/00 Initial version
// 03/02/00 New Algorithm
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [ the previously overwritten ] GR_Parameter_RESULT.
// 11/28/00 Set FR_Y to f9
// 03/11/02 Fixed flags for fmodl(qnan, zero)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header:.section,.global,.proc,.align
// 04/28/03 Fix: fmod(sNaN, 0) no longer sets errno
// 11/23/04 Reformatted routine and improved speed
//
// API
//====================================================================
// long double fmodl(long double, long double);
//
// Overview of operation
//====================================================================
// fmod(a, b)= a-i*b,
// where i is an integer such that, if b!= 0,
// |i|<=|a/b| and |a/b-i|<1
//
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2, y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>= 32 (q2>= 2^32)
// round quotient estimate to single precision (k= RN(q2)),
// calculate partial remainder (a'= a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
//
// Registers used
//====================================================================
// Register aliases for fmodl.
// General-register scratch values.
// NOTE(review): GR_SMALLBIASEXP is assigned twice (r2 here, r20 two lines
// below); presumably the later assignment is the one in effect when the
// symbol is used -- confirm and drop the stale definition.
GR_SMALLBIASEXP = r2
GR_2P32 = r3
GR_SMALLBIASEXP = r20
GR_ROUNDCONST = r21
GR_SIG_B = r22
GR_ARPFS = r23
GR_TMP1 = r24
GR_TMP2 = r25
GR_TMP3 = r26
// Save slots and __libm_error_support argument registers; same
// numbering as the equates in the fmod and fmodf sources.
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point aliases.  Note that FR_X and FR_Y_INV both name f10.
FR_X = f10
FR_Y = f9
FR_RESULT = f8
FR_ABS_A = f6
FR_ABS_B = f7
FR_Y_INV = f10
FR_SMALLBIAS = f11
FR_E0 = f12
FR_Q = f13
FR_E1 = f14
FR_2P32 = f15
FR_TMPX = f32
FR_TMPY = f33
FR_ROUNDCONST = f34
FR_QINT = f35
FR_QRND24 = f36
FR_NORM_B = f37
FR_TMP = f38
FR_TMP2 = f39
FR_DFLAG = f40
FR_Y_INV0 = f41
FR_Y_INV1 = f42
FR_Q0 = f43
FR_Q1 = f44
FR_QINT_Z = f45
FR_QREM = f46
FR_B_SGN_A = f47
.section .text
GLOBAL_IEEE754_ENTRY(fmodl)
// inputs in f8, f9
// result in f8
{ .mfi
getf.sig GR_SIG_B = f9
// FR_ABS_A = |a|
fmerge.s FR_ABS_A = f0, f8
mov GR_SMALLBIASEXP = 0x0ffdd
}
{ .mfi
nop.m 0
// FR_ABS_B = |b|
fmerge.s FR_ABS_B = f0, f9
nop.i 0
}
;;
{ .mfi
setf.exp FR_SMALLBIAS = GR_SMALLBIASEXP
// (1) y0
frcpa.s1 FR_Y_INV0, p6 = FR_ABS_A, FR_ABS_B
nop.i 0
}
;;
{ .mlx
nop.m 0
movl GR_ROUNDCONST = 0x33a00000
}
;;
// eliminate special cases
{ .mmi
nop.m 0
nop.m 0
// y pseudo-zero ?
cmp.eq p7, p10 = GR_SIG_B, r0
}
;;
// set p7 if b +/-NAN, +/-inf, +/-0
{ .mfi
nop.m 0
(p10) fclass.m p7, p10 = f9, 0xe7
nop.i 0
}
;;
{ .mfi
mov GR_2P32 = 0x1001f
// (2) q0 = a*y0
(p6) fma.s1 FR_Q0 = FR_ABS_A, FR_Y_INV0, f0
nop.i 0
}
{ .mfi
nop.m 0
// (3) e0 = 1 - b * y0
(p6) fnma.s1 FR_E0 = FR_ABS_B, FR_Y_INV0, f1
nop.i 0
}
;;
// set p9 if a +/-NAN, +/-inf
{ .mfi
nop.m 0
fclass.m.unc p9, p11 = f8, 0xe3
nop.i 0
}
// |a| < |b|? Return a, p8=1
{ .mfi
nop.m 0
(p10) fcmp.lt.unc.s1 p8, p0 = FR_ABS_A, FR_ABS_B
nop.i 0
}
;;
// set p7 if b +/-NAN, +/-inf, +/-0
{ .mfi
nop.m 0
// pseudo-NaN ?
(p10) fclass.nm p7, p0 = f9, 0xff
nop.i 0
}
;;
// set p9 if a is +/-NaN, +/-Inf
{ .mfi
nop.m 0
(p11) fclass.nm p9, p0 = f8, 0xff
nop.i 0
}
{ .mfi
nop.m 0
// b denormal ? set D flag (if |a|<|b|)
(p8) fnma.s0 FR_DFLAG = f9, f1, f9
nop.i 0
}
;;
{ .mfi
// FR_2P32 = 2^32
setf.exp FR_2P32 = GR_2P32
// (4) q1 = q0+e0*q0
(p6) fma.s1 FR_Q1 = FR_E0, FR_Q0, FR_Q0
nop.i 0
}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 FR_E1 = FR_E0, FR_E0, FR_SMALLBIAS
nop.i 0
}
;;
{ .mfi
nop.m 0
// normalize a (if |a|<|b|)
(p8) fma.s0 f8 = f8, f1, f0
nop.i 0
}
{ .bbb
(p9) br.cond.spnt FMOD_A_NAN_INF
(p7) br.cond.spnt FMOD_B_NAN_INF_ZERO
// if |a|<|b|, return
(p8) br.ret.spnt b0
}
;;
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 FR_Y_INV1 = FR_E0, FR_Y_INV0, FR_Y_INV0
nop.i 0
}
;;
{ .mfi
nop.m 0
// a denormal ? set D flag
// b denormal ? set D flag
fcmp.eq.s0 p12,p0 = FR_ABS_A, FR_ABS_B
nop.i 0
}
{ .mfi
// set FR_ROUNDCONST = 1.25*2^{-24}
setf.s FR_ROUNDCONST = GR_ROUNDCONST
// (7) q2 = q1+e1*q1
(p6) fma.s1 FR_Q = FR_Q1, FR_E1, FR_Q1
nop.i 0
}
;;
{ .mfi
nop.m 0
fmerge.s FR_B_SGN_A = f8, f9
nop.i 0
}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 FR_Y_INV = FR_E1, FR_Y_INV1, FR_Y_INV1
// set p6 = 0, p10 = 0
cmp.ne.and p6, p10 = r0, r0
}
;;
// will compute integer quotient bits (24 bits per iteration)
.align 32
loop64:
{ .mfi
nop.m 0
// compare q2, 2^32
fcmp.lt.unc.s1 p8, p7 = FR_Q, FR_2P32
nop.i 0
}
{ .mfi
nop.m 0
// will truncate quotient to integer, if exponent<32 (in advance)
fcvt.fx.trunc.s1 FR_QINT = FR_Q
nop.i 0
}
;;
{ .mfi
nop.m 0
// if exponent>32 round quotient to single precision (perform in advance)
fma.s.s1 FR_QRND24 = FR_Q, f1, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
// set FR_ROUNDCONST = sgn(a)
(p8) fmerge.s FR_ROUNDCONST = f8, f1
nop.i 0
}
{ .mfi
nop.m 0
// normalize truncated quotient
(p8) fcvt.xf FR_QRND24 = FR_QINT
nop.i 0
}
;;
{ .mfi
nop.m 0
// calculate remainder (assuming FR_QRND24 = RZ(Q))
(p7) fnma.s1 FR_E1 = FR_QRND24, FR_ABS_B, FR_ABS_A
nop.i 0
}
{ .mfi
nop.m 0
// also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q = q-q*(1.25*2^{-24})
(p7) fnma.s.s1 FR_QINT_Z = FR_QRND24, FR_ROUNDCONST, FR_QRND24
nop.i 0
}
;;
{ .mfi
nop.m 0
// (p8) calculate remainder (82-bit format)
(p8) fnma.s1 FR_QREM = FR_QRND24, FR_ABS_B, FR_ABS_A
nop.i 0
}
{ .mfi
nop.m 0
// (p7) calculate remainder (assuming FR_QINT_Z = RZ(Q))
(p7) fnma.s1 FR_ABS_A = FR_QINT_Z, FR_ABS_B, FR_ABS_A
nop.i 0
}
;;
{ .mfi
nop.m 0
// Final iteration (p8): is FR_ABS_A the correct remainder
// (quotient was not overestimated) ?
(p8) fcmp.lt.unc.s1 p6, p10 = FR_QREM, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
// get new quotient estimation: a'*y2
(p7) fma.s1 FR_Q = FR_E1, FR_Y_INV, f0
nop.i 0
}
{ .mfb
nop.m 0
// was FR_Q = RZ(Q) ? (then new remainder FR_E1> = 0)
(p7) fcmp.lt.unc.s1 p7, p9 = FR_E1, f0
nop.b 0
}
;;
.pred.rel "mutex", p6, p10
{ .mfb
nop.m 0
// add b to estimated remainder (to cover the case when the quotient was
// overestimated)
// also set correct sign by using
// FR_B_SGN_A = |b|*sgn(a), FR_ROUNDCONST = sgn(a)
(p6) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, FR_B_SGN_A
nop.b 0
}
{ .mfb
nop.m 0
// set correct sign of result before returning: FR_ROUNDCONST = sgn(a)
(p10) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, f0
(p8) br.ret.sptk b0
}
;;
{ .mfi
nop.m 0
// if f13! = RZ(Q), get alternative quotient estimation: a''*y2
(p7) fma.s1 FR_Q = FR_ABS_A, FR_Y_INV, f0
nop.i 0
}
{ .mfb
nop.m 0
// if FR_E1 was RZ(Q), set remainder to FR_E1
(p9) fma.s1 FR_ABS_A = FR_E1, f1, f0
br.cond.sptk loop64
}
;;
FMOD_A_NAN_INF:
// b zero ?
{ .mfi
nop.m 0
fclass.m p10, p0 = f8, 0xc3 // Test a = nan
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 FR_NORM_B = f9, f1, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s0 f8 = f8, f1, f0
nop.i 0
}
{ .mfi
nop.m 0
(p10) fclass.m p10, p0 = f9, 0x07 // Test x = nan, and y = zero
nop.i 0
}
;;
{ .mfb
nop.m 0
fcmp.eq.unc.s1 p11, p0 = FR_NORM_B, f0
(p10) br.ret.spnt b0 // Exit with result = a if a = nan and b = zero
}
;;
{ .mib
nop.m 0
nop.i 0
// if Y zero
(p11) br.cond.spnt FMOD_B_ZERO
}
;;
// a= infinity? Return QNAN indefinite
{ .mfi
// set p7 t0 0
cmp.ne p7, p0 = r0, r0
fclass.m.unc p8, p9 = f8, 0x23
nop.i 0
}
;;
// b NaN ?
{ .mfi
nop.m 0
(p8) fclass.m p9, p8 = f9, 0xc3
nop.i 0
}
;;
// b not pseudo-zero ? (GR_SIG_B holds significand)
{ .mii
nop.m 0
(p8) cmp.ne p7, p0 = GR_SIG_B, r0
nop.i 0
}
;;
{ .mfi
nop.m 0
(p8) frcpa.s0 f8, p0 = f8, f8
nop.i 0
}
{ .mfi
nop.m 0
// also set Denormal flag if necessary
(p7) fnma.s0 f9 = f9, f1, f9
nop.i 0
}
;;
{ .mfb
nop.m 0
(p8) fma.s0 f8 = f8, f1, f0
nop.b 0
}
;;
{ .mfb
nop.m 0
(p9) frcpa.s0 f8, p7 = f8, f9
br.ret.sptk b0
}
;;
FMOD_B_NAN_INF_ZERO:
// b INF
{ .mfi
nop.m 0
fclass.m.unc p7, p0 = f9, 0x23
nop.i 0
}
;;
{ .mfb
nop.m 0
(p7) fma.s0 f8 = f8, f1, f0
(p7) br.ret.spnt b0
}
;;
// b NAN?
{ .mfi
nop.m 0
fclass.m.unc p9, p10 = f9, 0xc3
nop.i 0
}
;;
{ .mfi
nop.m 0
(p10) fclass.nm p9, p0 = f9, 0xff
nop.i 0
}
;;
{ .mfb
nop.m 0
(p9) fma.s0 f8 = f9, f1, f0
(p9) br.ret.spnt b0
}
;;
FMOD_B_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
{ .mfi
nop.m 0
// set Invalid
frcpa.s0 FR_TMP, p0 = f0, f0
nop.i 0
}
;;
// a NAN?
{ .mfi
nop.m 0
fclass.m.unc p9, p10 = f8, 0xc3
nop.i 0
}
;;
{ .mfi
alloc GR_ARPFS = ar.pfs, 1, 4, 4, 0
(p10) fclass.nm p9, p10 = f8, 0xff
nop.i 0
}
;;
{ .mfi
nop.m 0
(p9) frcpa.s0 FR_TMP2, p7 = f8, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
(p10) frcpa.s0 FR_TMP2, p7 = f9, f9
mov GR_Parameter_TAG = 120
}
;;
{ .mfi
nop.m 0
fmerge.s FR_X = f8, f8
nop.i 0
}
{ .mfb
nop.m 0
fma.s0 f8 = FR_TMP2, f1, f0
br.sptk __libm_error_region
}
;;
GLOBAL_IEEE754_END(fmodl)
libm_alias_ldouble_other (__fmod, fmod)
// __libm_error_region (fmodl variant)
// Builds a 64-byte stack frame, spills FR_X, FR_Y and FR_RESULT to it as
// 80-bit extended values (stfe), calls __libm_error_support, then reloads
// the (possibly replaced) result into f8 and returns to the original caller.
// Frame layout relative to the new sp: X at sp+16, Y at sp+32, result at
// sp+48.  ar.pfs, gp and b0 are kept in GR_SAVE_PFS/GP/B0 across the call.
// The GR_*/FR_* names are symbolic register assignments made earlier in
// the file.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y = -32, sp // Address of the parameter 2 slot (old sp-32)
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp = -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP = gp // Save gp
}
;;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y pointing at the result slot
stfe [ GR_Parameter_Y ] = FR_Y, 16 // Save Parameter 2 on stack
add GR_Parameter_X = 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0 = b0 // Save b0
}
;;
.body
// The next two bundles form one instruction group (no stop between them):
// the store and both adds all read the value GR_Parameter_Y had on entry
// to the group, i.e. the address of the result slot.
{ .mib
stfe [ GR_Parameter_X ] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfe [ GR_Parameter_Y ] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16, GR_Parameter_Y // Back to the parameter 2 address
br.call.sptk b0 = __libm_error_support# // Call error handling function
}
;;
{ .mmi
nop.m 0
nop.m 0
// Recompute the result address after the call instead of trusting the
// register contents left by the callee
add GR_Parameter_RESULT = 48, sp
}
;;
{ .mmi
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp = 64, sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
}
;;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
}
;;
LOCAL_LIBM_END(__libm_error_region)
// The error handler is an external function defined elsewhere
.type __libm_error_support#, @function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,438 +0,0 @@
.file "hypot.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
// History:
// 02/02/00 hand-optimized
// 04/04/00 Unwind support added
// 06/20/00 new version
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 04/17/03 Added missing mutex directive
//
//*********************************************************************
//
// Function: hypot(x,y) = sqrt(x^2 + y^2) for double precision values
// x and y
// Also provides cabs functionality.
//
//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
// f9 (Input)
// f6 -f15, f32-f34
//
// General Purpose Registers:
// r2,r3,r29 (Scratch)
// r32-r35 (Locals)
// r36-r39 (Used to pass arguments to error handling routine)
//
// Predicate Registers: p6 - p10
//
//*********************************************************************
//
// IEEE Special Conditions:
//
// All faults and exceptions should be raised correctly.
// Overflow can occur.
// hypot(Infinity and anything) = +Infinity
// hypot(QNaN and anything) = QNaN
// hypot(SNaN and anything ) = QNaN
//
//*********************************************************************
//
// Implementation:
// x2 = x * x in double-extended
// y2 = y * y in double-extended
// temp = x2 + y2 in double-extended
// sqrt(temp) rounded to double
//
//*********************************************************************
// Symbolic register names for hypot.
// The routine's alloc (r32=ar.pfs,0,4,4,0) provides 4 locals (r32-r35) and
// 4 outputs (r36-r39); r32 itself receives ar.pfs from that alloc.
// Locals used by the error path to preserve state across the call:
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
// Output registers, i.e. the arguments handed to __libm_error_support:
// addresses of the stacked x, y and result values, plus the error tag.
GR_Parameter_X = r36
GR_Parameter_Y = r37
GR_Parameter_RESULT = r38
GR_Parameter_TAG = r39
// Copies of the original x and y (made at the top of hypot) and the result
FR_X = f32
FR_Y = f33
FR_RESULT = f8
.section .text
// Empty local cabs entry placed directly in front of hypot.
// NOTE(review): presumably this makes cabs share hypot's code (the file
// header says "Also provides cabs functionality") -- confirm against the
// LOCAL_LIBM_ENTRY/END macro definitions.
LOCAL_LIBM_ENTRY(cabs)
LOCAL_LIBM_END(cabs)
// hypot(x, y): x is read from f8 and y from f9; the result is left in f8.
//
// Outline:
//  1. Form a = x*x + y*y while classifying x and y.  Predicates end up as
//       p7 = x is Inf, or (y is zero and x is not NaN)   -> return |x|
//       p8 = y is Inf, or (x is zero and y is not NaN)   -> return |y|
//  2. If a itself is NaTVal/NaN/Inf/zero, return a.
//  3. Otherwise start from z0 = frsqrta(a), S0 = a*z0, H0 = 0.5*z0,
//     d = 0.5 - S0*H0, evaluate the polynomial P(d) with the coefficients
//     3/2, 5/2, 35/8, 63/8, 231/16, 429/16 and deliver S = P*(d*S0) + S0
//     rounded to double.
//  4. If the exponent of a is not safely below the overflow range, redo the
//     final fma in status field 2 with the widest-range exponent enabled and
//     compare with 2^(EMAX+1).  When that check does not report "no
//     overflow", execution runs off the end of this routine with
//     GR_Parameter_TAG = 46 (presumably into the __libm_error_region that
//     follows -- depends on the entry/end macros).
GLOBAL_IEEE754_ENTRY(hypot)
{.mfi
// 0 inputs, 4 locals (r32-r35), 4 outputs (r36-r39); r32 receives ar.pfs
alloc r32= ar.pfs,0,4,4,0
// Compute x*x
fma.s1 f10=f8,f8,f0
// r2=bias-1 (exponent field of 0.5)
mov r2=0xfffe
}
{.mfi
// 63/8: upper half of the single-precision pattern 0x40fc0000
mov r3=0x40fc //0000
// y*y
fma.s1 f11=f9,f9,f0
// r29=429/16: upper bits of the single-precision pattern 0x41d68000
mov r29=0x41d68;; //000
}
{ .mfi
nop.m 0
// Check if x is an Inf - if so return Inf even
// if y is a NaN (C9X)
fclass.m.unc p7, p6 = f8, 0x023
// r3 = 0x40fc0000
shl r3=r3,16
}
{.mfi
nop.m 0
// copy f8 to f32 (FR_X); it is used by the special-case return below
// and stored by the error path
// set Denormal, if necessary
fma.d.s0 f32=f8,f1,f0
nop.i 0;;
}
{ .mfi
nop.m 0
// Check if y is an Inf - if so return Inf even
// if x is a NaN (C9X)
fclass.m.unc p8, p9 = f9, 0x023
// r29 = 0x41d68000
shl r29=r29,12
}
{ .mfb
// f7=0.5
setf.exp f7=r2
// For x=inf, multiply y by 1 to raise invalid on y an SNaN
// (instruction left disabled:)
// (p7) fma.s0 f9=f9,f1,f0
// copy f9 to f33 (FR_Y); set Denormal, if necessary
fma.d.s0 f33=f9,f1,f0
nop.b 0;;
}
{.mfb
// f13=63/8
setf.s f13=r3
// is y Zero ? (only evaluated when x is not Inf; p6 stays 0 otherwise)
(p6) fclass.m p6,p0=f9,0x7
nop.b 0
}
{.mlx
nop.m 0
// single-precision pattern of 35/8
movl r2=0x408c0000;;
}
{.mfi
// f34=429/16
setf.s f34=r29
// is x Zero ? (only evaluated when y is not Inf; p9 stays 0 otherwise)
(p9) fclass.m p9,p0=f8,0x7
// 231/16
mov r3=0x4167;; //0000
}
{.mfi
nop.m 0
// a=x2+y2
fma.s1 f12=f10,f1,f11
nop.i 0;;
}
{.mfi
nop.m 0
// y not NaN ? (x is zero here; sets p8 so that |y| is returned)
(p9) fclass.m p8,p0=f9,0x3f
// r3 = 0x41670000
shl r3=r3,16
}
{.mfi
nop.m 0
// f6=2
fma.s1 f6=f1,f1,f1
nop.i 0;;
}
{.mfi
nop.m 0
// x not NaN ? (y is zero here; sets p7 so that |x| is returned)
(p6) fclass.m p7,p0=f8,0x3f
nop.i 0;;
}
{.mfi
// f9=35/8 (y itself is no longer needed; its copy lives in f33)
setf.s f9=r2
nop.f 0
// 2*emax-2
mov r2=0x107fb;;
}
.pred.rel "mutex",p7,p8
{.mfb
nop.m 0
// if f8=Infinity or f9=Zero, return |f8|
(p7) fmerge.s f8=f0,f32
(p7) br.ret.spnt b0
}
{.mfb
nop.m 0
// if f9=Infinity or f8=Zero, return |f9|
(p8) fmerge.s f8=f0,f33
(p8) br.ret.spnt b0;;
}
{.mfi
// f10 =231/16
setf.s f10=r3
// z0=frsqrta(a); p6 is set when the iteration below has to be run
frsqrta.s1 f8,p6=f12
nop.i 0;;
}
{ .mfi
nop.m 0
// Identify Natvals, Infs, NaNs, and Zeros
// and return result
fclass.m.unc p7, p0 = f12, 0x1E7
nop.i 0;;
}
{.mfb
// get exponent of x^2+y^2
getf.exp r3=f12
// if special case, set f8
(p7) mov f8=f12
(p7) br.ret.spnt b0;;
}
{.mfi
nop.m 0
// S0=a*z0
(p6) fma.s1 f14=f12,f8,f0
nop.i 0
}
{.mfi
nop.m 0
// H0=0.5*z0
(p6) fma.s1 f15=f8,f7,f0
nop.i 0;;
}
{.mfi
nop.m 0
// f6=5/2
fma.s1 f6=f7,f1,f6
nop.i 0
}
{.mfi
nop.m 0
// f11=3/2
fma.s1 f11=f7,f1,f1
nop.i 0;;
}
{.mfi
nop.m 0
// d=0.5-S0*H0
(p6) fnma.s1 f7=f14,f15,f7
nop.i 0;;
}
{.mfi
nop.m 0
// P67=231/16+429/16*d
(p6) fma.s1 f10=f34,f7,f10
nop.i 0
}
{.mfi
nop.m 0
// P45=63/8*d+35/8
(p6) fma.s1 f9=f13,f7,f9
nop.i 0;;
}
{.mfi
nop.m 0
// P23=5/2*d+3/2
(p6) fma.s1 f11=f6,f7,f11
nop.i 0
}
{.mfi
nop.m 0
// d2=d*d
(p6) fma.s1 f13=f7,f7,f0
nop.i 0;;
}
{.mfi
nop.m 0
// P47=d2*P67+P45
(p6) fma.s1 f10=f10,f13,f9
nop.i 0
}
{.mfi
nop.m 0
// P13=d*P23+1
(p6) fma.s1 f11=f11,f7,f1
nop.i 0;;
}
{.mfi
nop.m 0
// d3=d2*d
(p6) fma.s1 f13=f13,f7,f0
nop.i 0;;
}
{.mfi
nop.m 0
// T0=d*S0
(p6) fma.s1 f15=f7,f14,f0
nop.i 0
}
{.mfi
// Is x^2 + y^2 well less than the overflow
// threshold? (biased exponent of a compared with bias+2*emax-2)
(p6) cmp.lt.unc p7, p8 = r3,r2
// P=P13+d3*P47
(p6) fma.s1 f10=f13,f10,f11
nop.i 0;;
}
{.mfb
nop.m 0
// S=P*T0+S0, rounded to double in the user's status field
fma.d.s0 f8=f10,f15,f14
// No overflow in this case
(p7) br.ret.sptk b0;;
}
{ .mfi
nop.m 0
(p8) fsetc.s2 0x7F,0x42
// Possible overflow path, must detect by
// Setting widest range exponent with prevailing
// rounding mode.
nop.i 0 ;;
}
{ .mfi
// bias+0x400 (bias+EMAX+1)
(p8) mov r2=0x103ff
// S=P*T0+S0, recomputed in status field 2
(p8) fma.d.s2 f12=f10,f15,f14
nop.i 0 ;;
}
{ .mfi
// f11 = 2^(EMAX+1)
(p8) setf.exp f11 = r2
(p8) fsetc.s2 0x7F,0x40
// Restore Original Mode in S2
nop.i 0 ;;
}
{ .mfi
nop.m 0
// p9 = wide-range result is below 2^(EMAX+1), i.e. no overflow
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
nop.i 0 ;;
}
{ .mib
nop.m 0
// Error tag handed to the error handling routine
mov GR_Parameter_TAG = 46
// No overflow
(p9) br.ret.sptk b0;;
}
GLOBAL_IEEE754_END(hypot)
libm_alias_double_other (__hypot, hypot)
// __libm_error_region (hypot variant)
// Builds a 64-byte stack frame, spills FR_X, FR_Y and FR_RESULT to it as
// doubles (stfd), calls __libm_error_support, then reloads the (possibly
// replaced) result into f8 and returns to hypot's caller.
// Frame layout relative to the new sp: X at sp+16, Y at sp+32, result at
// sp+48.  GR_Parameter_TAG is not written here; hypot sets it to 46 just
// before its end.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Address of the parameter 2 slot (old sp-32)
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y pointing at the result slot
stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
// The next two bundles form one instruction group: the store and both adds
// all read the value GR_Parameter_Y had on entry to the group.
{ .mib
stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the parameter 2 address
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// Recompute the result address after the call
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region#)
// The error handler is an external function defined elsewhere
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,394 +0,0 @@
.file "hypotf.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
// History:
// 02/02/00 hand-optimized
// 04/04/00 Unwind support added
// 06/26/00 new version
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 04/17/03 Added missing mutex directive
//
//*********************************************************************
//
// Function: hypotf(x,y) = sqrt(x^2 + y^2) for single precision values
// x and y
// Also provides cabsf functionality.
//
//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
// f9 (Input)
// f6 -f15
//
// General Purpose Registers:
// r2-r3 (Scratch)
// r32-r35 (Locals)
// r36-r39 (Used to pass arguments to error handling routine)
//
// Predicate Registers: p6 - p10
//
//*********************************************************************
//
// IEEE Special Conditions:
//
// All faults and exceptions should be raised correctly.
// Overflow can occur.
// hypotf(Infinity and anything) = +Infinity
// hypotf(QNaN and anything) = QNaN
// hypotf(SNaN and anything ) = QNaN
//
//*********************************************************************
//
// Implementation:
// x2 = x * x in double-extended
// y2 = y * y in double-extended
// temp = x2 + y2 in double-extended
// sqrt(temp) rounded to single precision
//
//*********************************************************************
// Symbolic register names for hypotf.
// The routine's alloc (r32=ar.pfs,0,4,4,0) provides 4 locals (r32-r35) and
// 4 outputs (r36-r39); r32 itself receives ar.pfs from that alloc.
// Locals used by the error path to preserve state across the call:
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
// Output registers, i.e. the arguments handed to __libm_error_support:
// addresses of the stacked x, y and result values, plus the error tag.
GR_Parameter_X = r36
GR_Parameter_Y = r37
GR_Parameter_RESULT = r38
GR_Parameter_TAG = r39
// Copies of the original x and y (made at the top of hypotf) and the result
FR_X = f14
FR_Y = f15
FR_RESULT = f8
.section .text
// Empty local cabsf entry placed directly in front of hypotf.
// NOTE(review): presumably this makes cabsf share hypotf's code (the file
// header says "Also provides cabsf functionality") -- confirm against the
// LOCAL_LIBM_ENTRY/END macro definitions.
LOCAL_LIBM_ENTRY(cabsf)
LOCAL_LIBM_END(cabsf)
// hypotf(x, y): x is read from f8 and y from f9; the result is left in f8.
//
// Outline:
//  1. Form a = x*x + y*y while classifying x and y.  Predicates end up as
//       p7 = x is Inf, or (y is zero and x is not NaN)   -> return |x|
//       p8 = y is Inf, or (x is zero and y is not NaN)   -> return |y|
//  2. If a itself is NaTVal/NaN/Inf/zero, return a.
//  3. Otherwise start from z0 = frsqrta(a), S0 = a*z0, H0 = 0.5*z0,
//     d = 0.5 - S0*H0, P = (1 + d) + d^2*(3/2 + 5/2*d) and deliver
//     S = P*S0 rounded to single precision.
//  4. If the exponent of a is not safely below the overflow range, redo the
//     final fma in status field 2 with the widest-range exponent enabled and
//     compare with 2^(EMAX+1).  When that check does not report "no
//     overflow", execution runs off the end of this routine with
//     GR_Parameter_TAG = 47 (presumably into the __libm_error_region that
//     follows -- depends on the entry/end macros).
GLOBAL_IEEE754_ENTRY(hypotf)
{.mfi
// 0 inputs, 4 locals (r32-r35), 4 outputs (r36-r39); r32 receives ar.pfs
alloc r32= ar.pfs,0,4,4,0
// Compute x*x
fma.s1 f10=f8,f8,f0
// r2=bias-1 (exponent field of 0.5)
mov r2=0xfffe
}
{.mfi
nop.m 0
// y*y
fma.s1 f11=f9,f9,f0
nop.i 0;;
}
{ .mfi
nop.m 0
// Check if x is an Inf - if so return Inf even
// if y is a NaN (C9X)
fclass.m.unc p7, p6 = f8, 0x023
nop.i 0
}
{.mfi
nop.m 0
// copy f8 to f14 (FR_X); it is used by the special-case return below
// and stored by the error path
// set Denormal, if necessary
fma.s.s0 f14=f8,f1,f0
nop.i 0;;
}
{ .mfi
nop.m 0
// Check if y is an Inf - if so return Inf even
// if x is a NaN (C9X)
fclass.m.unc p8, p9 = f9, 0x023
nop.i 0
}
{ .mfi
nop.m 0
// For x=inf, multiply y by 1 to raise invalid on y an SNaN
// (instruction left disabled:)
// (p7) fma.s0 f9=f9,f1,f0
// copy f9 to f15 (FR_Y); set Denormal, if necessary
fma.s.s0 f15=f9,f1,f0
nop.i 0;;
}
{.mfi
nop.m 0
// is y Zero ? (only evaluated when x is not Inf; p6 stays 0 otherwise)
(p6) fclass.m p6,p0=f9,0x7
nop.i 0;;
}
{.mfi
nop.m 0
// is x Zero ? (only evaluated when y is not Inf; p9 stays 0 otherwise)
(p9) fclass.m p9,p0=f8,0x7
nop.i 0;;
}
{.mfi
// f7=0.5
setf.exp f7=r2
// a=x2+y2
fma.s1 f12=f10,f1,f11
nop.i 0;;
}
{.mfi
nop.m 0
// x not NaN ? (y is zero here; sets p7 so that |x| is returned)
(p6) fclass.m p7,p0=f8,0x3f
nop.i 0
}
{.mfi
// 2*emax-2
mov r2=0x100fb
// f6=2
fma.s1 f6=f1,f1,f1
nop.i 0;;
}
{.mfi
nop.m 0
// y not NaN ? (x is zero here; sets p8 so that |y| is returned)
(p9) fclass.m p8,p0=f9,0x3f
nop.i 0;;
}
.pred.rel "mutex",p7,p8
{.mfb
nop.m 0
// if f8=Infinity or f9=Zero, return |f8|
(p7) fmerge.s f8=f0,f14
(p7) br.ret.spnt b0
}
{.mfb
nop.m 0
// if f9=Infinity or f8=Zero, return |f9|
(p8) fmerge.s f8=f0,f15
(p8) br.ret.spnt b0;;
}
{ .mfi
nop.m 0
// Identify Natvals, Infs, NaNs, and Zeros
// and return result
fclass.m.unc p7, p0 = f12, 0x1E7
nop.i 0
}
{.mfi
nop.m 0
// z0=frsqrta(a); p6 is set when the iteration below has to be run
frsqrta.s1 f8,p6=f12
nop.i 0;;
}
{.mfb
// get exponent of x^2+y^2
getf.exp r3=f12
// if special case, set f8
(p7) mov f8=f12
(p7) br.ret.spnt b0;;
}
{.mfi
nop.m 0
// S0=a*z0 (overwrites a in f12)
(p6) fma.s1 f12=f12,f8,f0
nop.i 0
}
{.mfi
nop.m 0
// H0=0.5*z0
(p6) fma.s1 f10=f8,f7,f0
nop.i 0;;
}
{.mfi
nop.m 0
// f6=5/2
fma.s1 f6=f7,f1,f6
nop.i 0
}
{.mfi
nop.m 0
// f11=3/2
fma.s1 f11=f7,f1,f1
nop.i 0;;
}
{.mfi
nop.m 0
// d=0.5-S0*H0
(p6) fnma.s1 f7=f12,f10,f7
nop.i 0;;
}
{.mfi
nop.m 0
// P01=d+1
(p6) fma.s1 f10=f1,f7,f1
nop.i 0
}
{.mfi
nop.m 0
// P23=5/2*d+3/2
(p6) fma.s1 f11=f6,f7,f11
nop.i 0;;
}
{.mfi
nop.m 0
// d2=d*d
(p6) fma.s1 f7=f7,f7,f0
nop.i 0;;
}
{.mfi
// Is x^2 + y^2 well less than the overflow
// threshold? (biased exponent of a compared with bias+2*emax-2)
(p6) cmp.lt.unc p7, p8 = r3,r2
// P=P01+d2*P23
(p6) fma.s1 f10=f7,f11,f10
nop.i 0;;
}
{.mfb
nop.m 0
// S=P*S0, rounded to single in the user's status field
fma.s.s0 f8=f10,f12,f0
// No overflow in this case
(p7) br.ret.sptk b0;;
}
{ .mfi
nop.m 0
(p8) fsetc.s2 0x7F,0x42
// Possible overflow path, must detect by
// Setting widest range exponent with prevailing
// rounding mode.
nop.i 0 ;;
}
{ .mfi
// bias+0x80 (bias+EMAX+1); 0x1007f = 0xffff + 0x80
(p8) mov r2=0x1007f
// S=P*S0, recomputed in status field 2
(p8) fma.s.s2 f12=f10,f12,f0
nop.i 0 ;;
}
{ .mfi
// f11 = 2^(EMAX+1)
(p8) setf.exp f11 = r2
(p8) fsetc.s2 0x7F,0x40
// Restore Original Mode in S2
nop.i 0 ;;
}
{ .mfi
nop.m 0
// p9 = wide-range result is below 2^(EMAX+1), i.e. no overflow
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
nop.i 0 ;;
}
{ .mib
nop.m 0
// Error tag handed to the error handling routine
mov GR_Parameter_TAG = 47
// No overflow
(p9) br.ret.sptk b0;;
}
GLOBAL_IEEE754_END(hypotf)
libm_alias_float_other (__hypot, hypot)
// __libm_error_region (hypotf variant)
// Builds a 64-byte stack frame, spills FR_X, FR_Y and FR_RESULT to it as
// single-precision values (stfs), calls __libm_error_support with tag 47,
// then reloads the (possibly replaced) result into f8 and returns to
// hypotf's caller.
// Frame layout relative to the new sp: X at sp+16, Y at sp+32, result at
// sp+48.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mii
add GR_Parameter_Y=-32,sp // Address of the parameter 2 slot (old sp-32)
// Same tag value that hypotf writes just before its end
mov GR_Parameter_TAG = 47
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y pointing at the result slot
stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
// The next two bundles form one instruction group: the store and both adds
// all read the value GR_Parameter_Y had on entry to the group.
{ .mib
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the parameter 2 address
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// Recompute the result address after the call
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// The error handler is an external function defined elsewhere
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,475 +0,0 @@
.file "hypotl.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
// History:
// 02/02/00 hand-optimized
// 04/04/00 Unwind support added
// 06/20/00 new version
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
//*********************************************************************
//
// Function: hypotl(x,y) = sqrt(x^2 + y^2) for double extended values
// x and y
// Also provides cabsl functionality.
//
//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
// f9 (Input)
// f6 -f15, f32-f34
//
// General Purpose Registers:
// r2-r3 (Scratch)
// r32-r35 (Locals)
// r36-r39 (Used to pass arguments to error handling routine)
//
// Predicate Registers: p6 - p10
//
//*********************************************************************
//
// IEEE Special Conditions:
//
// All faults and exceptions should be raised correctly.
// Overflow can occur.
// hypotl(Infinity and anything) = +Infinity
// hypotl(QNaN and anything) = QNaN
// hypotl(SNaN and anything ) = QNaN
//
//*********************************************************************
//
// Implementation:
// x2 = x * x in double-extended
// y2 = y * y in double-extended
// temp = x2 + y2 in double-extended
// sqrt(temp) rounded to double extended
//
//*********************************************************************
// Symbolic register names for hypotl.
// The routine's alloc (r32=ar.pfs,0,4,4,0) provides 4 locals (r32-r35) and
// 4 outputs (r36-r39); r32 itself receives ar.pfs from that alloc.
// Locals used by the error path to preserve state across the call:
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
// Output registers, i.e. the arguments handed to __libm_error_support:
// addresses of the stacked x, y and result values, plus the error tag.
GR_Parameter_X = r36
GR_Parameter_Y = r37
GR_Parameter_RESULT = r38
GR_Parameter_TAG = r39
// Copies of the original x and y (made at the top of hypotl) and the result
FR_X = f32
FR_Y = f33
FR_RESULT = f8
.section .text
// Empty local cabsl entry placed directly in front of hypotl.
// NOTE(review): presumably this makes cabsl share hypotl's code (the file
// header says "Also provides cabsl functionality") -- confirm against the
// LOCAL_LIBM_ENTRY/END macro definitions.
LOCAL_LIBM_ENTRY(cabsl)
LOCAL_LIBM_END(cabsl)
// hypotl(x, y): x is read from f8 and y from f9; the result is left in f8.
//
// Outline:
//  1. Form x2 = x*x, y2 = y*y and a = x2 + y2, and also the rounding errors
//     dx = x*x - x2, dy = y*y - y2 and da = min(x2,y2) - (a - max(x2,y2)),
//     so that c = dx + dy + da corrects a.
//     In parallel classify x and y.  Predicates end up as
//       p7 = x is Inf, or (y is zero and x is not NaN)   -> return |x|
//       p8 = y is Inf, or (x is zero and y is not NaN)   -> return |y|
//  2. If a itself is NaTVal/NaN/Inf/zero, return a.
//  3. Otherwise start from z0 = frsqrta(a), S0 = a*z0, H0 = 0.5*z0,
//     d = 0.5 - S0*H0, P = (1 + 3/2*d) + d^2*(5/2 + 35/8*d),
//     S1 = P*(d*S0) + S0, H1 = P*(d*H0) + H0, e = (a - S1*S1) + c and
//     deliver S = e*H1 + S1.
//  4. If the exponent of a is not safely below the overflow range, redo the
//     final fma in status field 2 with the widest-range exponent enabled and
//     compare with 2^(EMAX+1).  When that check does not report "no
//     overflow", execution runs off the end of this routine with
//     GR_Parameter_TAG = 45 (presumably into the __libm_error_region that
//     follows -- depends on the entry/end macros).
GLOBAL_IEEE754_ENTRY(hypotl)
{.mfi
// 0 inputs, 4 locals (r32-r35), 4 outputs (r36-r39); r32 receives ar.pfs
alloc r32= ar.pfs,0,4,4,0
// Compute x*x
fma.s1 f10=f8,f8,f0
// r2=bias-1 (exponent field of 0.5)
mov r2=0xfffe
}
{.mfi
nop.m 0
// y*y
fma.s1 f11=f9,f9,f0
nop.i 0;;
}
{ .mfi
nop.m 0
// Check if x is an Inf - if so return Inf even
// if y is a NaN (C9X)
fclass.m.unc p7, p6 = f8, 0x023
nop.i 0
}
{.mfi
nop.m 0
// copy f8 to f32 (FR_X); it is used by the special-case return below
// and stored by the error path
// set Denormal, if necessary
fma.s0 f32=f8,f1,f0
nop.i 0;;
}
{ .mfi
nop.m 0
// Check if y is an Inf - if so return Inf even
// if x is a NaN (C9X)
fclass.m.unc p8, p9 = f9, 0x023
nop.i 0
}
{ .mfi
nop.m 999
// For x=inf, multiply y by 1 to raise invalid on y an SNaN
// (instruction left disabled:)
// (p7) fma.s0 f9=f9,f1,f0
// copy f9 to f33 (FR_Y); set Denormal, if necessary
fma.s0 f33=f9,f1,f0
nop.i 0;;
}
{.mfi
nop.m 0
// is y Zero ? (only evaluated when x is not Inf; p6 stays 0 otherwise)
(p6) fclass.m p6,p0=f9,0x7
nop.i 0;;
}
{.mfi
// f7=0.5
setf.exp f7=r2
// a=x2+y2
fma.s1 f12=f10,f1,f11
nop.i 0
}
{.mfi
// upper half of the single-precision pattern of 35/8 (0x408c0000)
mov r2=0x408c //0000
// dx=x*x-x2
fms.s1 f13=f8,f8,f10
nop.i 0;;
}
{.mfi
nop.m 0
// is x Zero ? (only evaluated when y is not Inf; p9 stays 0 otherwise)
(p9) fclass.m p9,p0=f8,0x7
// r2 = 0x408c0000
shl r2=r2,16
}
{.mfi
nop.m 0
// dy=y*y-y2
fms.s1 f14=f9,f9,f11
nop.i 0;;
}
{.mfi
nop.m 0
// x not NaN ? (y is zero here; sets p7 so that |x| is returned)
(p6) fclass.m p7,p0=f8,0x3f
nop.i 0
}
{.mfi
nop.m 0
// f6=2
fma.s1 f6=f1,f1,f1
nop.i 0;;
}
{.mfi
nop.m 0
// f34=min(x2,y2)
famin.s1 f34=f10,f11
nop.i 0
}
{.mfb
nop.m 0
// f10=max(x2,y2)
famax.s1 f10=f11,f10
nop.b 0;; //
}
{.mfi
nop.m 0
// y not NaN ? (x is zero here; sets p8 so that |y| is returned)
(p9) fclass.m p8,p0=f9,0x3f
nop.i 0;;
}
{.mfb
// f9=35/8 (y itself is no longer needed; its copy lives in f33)
setf.s f9=r2
// if f8=Infinity or f9=Zero, return |f8|
(p7) fmerge.s f8=f0,f32
(p7) br.ret.spnt b0;;
}
{.mfi
nop.m 0
// z0=frsqrta(a); p6 is set when the iteration below has to be run
frsqrta.s1 f8,p6=f12
nop.i 0;;
}
{ .mfi
nop.m 0
// Identify Natvals, Infs, NaNs, and Zeros
// and return result
fclass.m.unc p7, p0 = f12, 0x1E7
nop.i 0
}
{.mfi
// get exponent of x^2+y^2
getf.exp r3=f12
// dxy=dx+dy
fma.s1 f13=f13,f1,f14
nop.i 0;;
}
{.mfb
// 2*emax-2
mov r2=0x17ffb
// if f9=Infinity or f8=Zero, return |f9|
// (f8 already holds z0 here, so the value comes from the copy in f33)
(p8) fmerge.s f8=f0,f33
(p8) br.ret.spnt b0
}
{.mfi
nop.m 0
// dd=a-max(x2,y2)
fnma.s1 f10=f10,f1,f12
nop.i 0;;
}
{.mfi
nop.m 0
// S0=a*z0
(p6) fma.s1 f14=f12,f8,f0
nop.i 0
}
{.mfi
nop.m 0
// H0=0.5*z0
(p6) fma.s1 f15=f8,f7,f0
nop.i 0;;
}
{.mfb
nop.m 0
// if special case, set f8
(p7) mov f8=f12
(p7) br.ret.spnt b0
}
{.mfi
nop.m 0
// da=min(x2,y2)-dd
fnma.s1 f10=f10,f1,f34
nop.i 0;;
}
{.mfi
nop.m 0
// f6=5/2
fma.s1 f6=f7,f1,f6
nop.i 0
}
{.mfi
nop.m 0
// f11=3/2
fma.s1 f11=f7,f1,f1
nop.i 0;;
}
{.mfi
nop.m 0
// d=0.5-S0*H0
(p6) fnma.s1 f7=f14,f15,f7
nop.i 0;;
}
{.mfi
nop.m 0
// P1=3/2*d+1
(p6) fma.s1 f11=f11,f7,f1
nop.i 0
}
{.mfi
nop.m 0
// P2=35/8*d+5/2
(p6) fma.s1 f9=f9,f7,f6
nop.i 0;;
}
{.mfi
nop.m 0
// d2=d*d
(p6) fma.s1 f34=f7,f7,f0
nop.i 0;;
}
{.mfi
nop.m 0
// T0=d*S0
(p6) fma.s1 f6=f7,f14,f0
nop.i 0
}
{.mfi
nop.m 0
// G0=d*H0
(p6) fma.s1 f7=f7,f15,f0
nop.i 0;;
}
{.mfi
nop.m 0
// P=d2*P2+P1
(p6) fma.s1 f11=f34,f9,f11
nop.i 0;;
}
{.mfi
nop.m 0
// S1=p*T0+S0
(p6) fma.s1 f14=f11,f6,f14
nop.i 0
}
{.mfi
nop.m 0
// H1=p*G0+H0
(p6) fma.s1 f15=f11,f7,f15
nop.i 0;;
}
{.mfi
nop.m 0
// e1=a-S1*S1
(p6) fnma.s1 f7=f14,f14,f12
nop.i 0
}
{.mfi
// Is x^2 + y^2 well less than the overflow
// threshold? (biased exponent of a compared with bias+2*emax-2)
(p6) cmp.lt.unc p7, p8 = r3,r2
// c=dxy+da
(p6) fma.s1 f13=f13,f1,f10
nop.i 0;;
}
{.mfi
nop.m 0
// e=e1+c
(p6) fma.s1 f13=f7,f1,f13
nop.i 0;;
}
{.mfb
nop.m 0
// S=e*H1+S1, rounded in the user's status field
fma.s0 f8=f13,f15,f14
// No overflow in this case
(p7) br.ret.sptk b0;;
}
{ .mfi
nop.m 0
(p8) fsetc.s2 0x7F,0x42
// Possible overflow path, must detect by
// Setting widest range exponent with prevailing
// rounding mode.
nop.i 0 ;;
}
{ .mfi
// bias+0x4000 (bias+EMAX+1)
(p8) mov r2=0x13fff
// S=e*H1+S1, recomputed in status field 2
(p8) fma.s2 f12=f13,f15,f14
nop.i 0 ;;
}
{ .mfi
// f11 = 2^(EMAX+1)
(p8) setf.exp f11 = r2
(p8) fsetc.s2 0x7F,0x40
// Restore Original Mode in S2
nop.i 0 ;;
}
{ .mfi
nop.m 0
// p9 = wide-range result is below 2^(EMAX+1), i.e. no overflow
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
nop.i 0 ;;
}
{ .mib
nop.m 0
// Error tag handed to the error handling routine
mov GR_Parameter_TAG = 45;
// No overflow
(p9) br.ret.sptk b0;;
}
GLOBAL_IEEE754_END(hypotl)
libm_alias_ldouble_other (__hypot, hypot)
// __libm_error_region (hypotl variant)
// Builds a 64-byte stack frame, spills FR_X, FR_Y and FR_RESULT to it as
// 80-bit extended values (stfe), calls __libm_error_support, then reloads
// the (possibly replaced) result into f8 and returns to hypotl's caller.
// Frame layout relative to the new sp: X at sp+16, Y at sp+32, result at
// sp+48.  GR_Parameter_TAG is not written here; hypotl sets it to 45 just
// before its end.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Address of the parameter 2 slot (old sp-32)
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// Post-increment by 16 leaves GR_Parameter_Y pointing at the result slot
stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
// The next two bundles form one instruction group: the store and both adds
// all read the value GR_Parameter_Y had on entry to the group.
{ .mib
stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the parameter 2 address
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// Recompute the result address after the call
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region#)
// The error handler is an external function defined elsewhere
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,267 +0,0 @@
.file "ilogbl.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 02/03/00 Initial version
// 05/26/00 Fix bug when x a double-extended denormal;
// if x=0 call error routine, per C9X
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 01/20/01 Fixed result for x=0
// 05/20/02 Cleaned up namespace and sf0 syntax
// 01/20/03 Improved performance
//
// API
//==============================================================
// int ilogbl( long double x );
//
// Overview of operation
//==============================================================
// The ilogbl function extracts the exponent of x as an integer
// and returns it in r8
//
// ilogbl is similar to logbl but differs in the following ways:
// +-inf
// ilogbl: returns INT_MAX
// logbl: returns +inf
// NaN returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
// ilogbl: returns INT_MAX (7fffffff)
// logbl: returns QNAN (quietized SNAN)
// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
// ilogbl: returns INT_MIN (80000000)
// logbl: returns -inf, raises the divide-by-zero exception,
// and calls libm_error_support to set domain error
//
// Registers used
//==============================================================
// general registers used:
// r26 -> r39
// r36 -> r39 used as parameters to error path
//
// predicate registers used:
// p6 -> p10
// floating-point registers used:
// f9, f10, f11
// f8, input
// Symbolic register names for __ieee754_ilogbl and its error path.
//
// Scratch general registers used by the main path:
rExpBias = r26
// rExpBias holds the exponent bias that is subtracted from the biased exponent
rExpMask = r27
// rExpMask strips the sign bit from the getf.exp result
rSignexp_x = r28
// rSignexp_x receives the raw sign+exponent field read with getf.exp
rExp_x = r29
// rExp_x is rSignexp_x masked with rExpMask (biased exponent only)
rIntMax = r30
// rIntMax holds 0x7fffffff, returned for NaN/infinity/NaTVal
rExp_2to64 = r31
// rExp_2to64 holds the sign+exponent pattern used to build 2^64
//
// Stacked registers, valid only after the alloc in ILOGB_ZERO
// (1 input r32, 3 locals r33-r35, 4 outputs r36-r39):
GR_SAVE_PFS = r32
rTrialResult = r33
// rTrialResult carries the integer result across the error-support call
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
// Output registers: arguments of __libm_error_support
GR_Parameter_X = r36
GR_Parameter_Y = r37
GR_Parameter_RESULT = r38
GR_Parameter_TAG = r39
// Floating-point scratch registers
fTmp = f9
// fTmp is a throw-away destination (flag-raising dummy op, error-path store)
fNorm_x = f10
// fNorm_x is the normalized copy of the input
f2to64 = f11
// f2to64 is the scale factor applied on the ILOGB_DENORM path
.section .text
// int __ieee754_ilogbl(long double x)
//   In:   f8 = x
//   Out:  r8 = (biased exponent of x) - bias            for finite nonzero x
//         r8 = 0x7fffffff                                for NaN, infinity, NaTVal
//         r8 = 0x80000000                                for zero (returned through
//              __libm_error_region with error tag 156)
//   Unnormal inputs detour through ILOGB_DENORM, which re-reads the exponent
//   from a normalized (and, if necessary, 2^64-scaled) copy of x and then
//   rejoins the main path at ILOGB_COMMON.
//   Predicates: p6 = nan/inf/natval, p9 = !p6, p7 = zero, p10 = !p7,
//               p8 = unnormal (reused inside ILOGB_DENORM).
GLOBAL_LIBM_ENTRY(__ieee754_ilogbl)
// X NORMAL
// TrueExp_x = exp(f8) - 0xffff
// r8 = TrueExp_x
{ .mfi
getf.exp rSignexp_x = f8
fclass.m p8,p0 = f8, 0x0b // Test for x unorm
mov rExpBias = 0xffff // Exponent bias
}
{ .mfi
nop.m 0
fnorm.s1 fNorm_x = f8
mov rExpMask = 0x1ffff // Exponent mask
}
;;
// Form signexp of 2^64 in case need to scale denormal
// p6/p9 are set here, before the branch, so they are still valid when the
// denormal path comes back to ILOGB_COMMON.
{ .mfb
mov rExp_2to64 = 0x1003f
fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf
(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm
}
;;
ILOGB_COMMON:
// Return here from ILOGB_DENORM
// On entry rSignexp_x and rExpBias describe either x itself or its
// normalized/scaled replacement.
{ .mfi
and rExp_x = rSignexp_x, rExpMask // Get biased exponent
fclass.m p7,p10 = f8, 0x07 // Test x zero
nop.i 0
}
{ .mlx
nop.m 0
movl rIntMax = 0x000000007fffffff // Form INT_MAX
}
;;
// Exactly one of p6/p9 is set, so only one of the two writes to r8 executes.
.pred.rel "mutex",p6,p9
{ .mfi
(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path
(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag
(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX
}
{ .mbb
nop.m 0
(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero
(p10) br.ret.sptk b0 // Exit if x not zero
}
;;
ILOGB_DENORM:
// Form 2^64 in case need to scale denormal
// Check to see if double-extended denormal
// (p8 is re-tested on the normalized copy, not on the original x)
{ .mfi
setf.exp f2to64 = rExp_2to64
fclass.m p8,p0 = fNorm_x, 0x0b
nop.i 0
}
;;
// The p7 written here is not consumed; ILOGB_COMMON recomputes p7 with its
// own fclass.m before using it.
{ .mfi
nop.m 0
fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
nop.i 0
}
;;
// If double-extended denormal add 64 to exponent bias for scaling
// If double-extended denormal form x * 2^64 which is normal
// (the larger bias cancels the 2^64 scaling in the final subtraction)
{ .mfi
(p8) add rExpBias = 64, rExpBias
(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
nop.i 0
}
;;
// Logic is the same as normal path but use normalized input
{ .mib
getf.exp rSignexp_x = fNorm_x
nop.i 0
br.cond.sptk ILOGB_COMMON // Return to main path
}
;;
ILOGB_ZERO:
// Here if x zero
// Return INT_MIN, call error support
// The alloc creates the stacked registers (r32-r39) used by the error path:
// 1 input, 3 locals, 4 outputs. rTrialResult lives in a local so it survives
// the call made in __libm_error_region.
{ .mlx
alloc r32=ar.pfs,1,3,4,0
movl rTrialResult = 0x0000000080000000
}
{ .mib
mov GR_Parameter_TAG = 156 // Error code
nop.i 0
br.cond.sptk __libm_error_region // Call error support
}
;;
GLOBAL_LIBM_END(__ieee754_ilogbl)
// __libm_error_region (ilogbl): reached by branch from ILOGB_ZERO with
// GR_Parameter_TAG already set and rTrialResult holding the integer result.
// Builds a 64-byte frame, stores three floating-point values into it, passes
// their addresses plus the tag to __libm_error_support, then returns
// rTrialResult in r8 to the original caller.
//
// Frame layout relative to the new sp (sp_new = sp_old - 64):
//   sp_new+16 : copy of f8        (address passed in GR_Parameter_X)
//   sp_new+32 : copy of f0        (address passed in GR_Parameter_Y)
//   sp_new+48 : copy of f9        (address passed in GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32 = new sp + 32)
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack, then advance pointer by 16
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
// f9 (fTmp) is stored as the result slot; the value read back from that slot
// is never used below, because r8 is taken from rTrialResult instead.
{ .mib
stfe [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
// Recompute the result-slot address from sp after the call.
{ .mmi
add GR_Parameter_RESULT = 48,sp
nop.m 0
nop.i 0
};;
{ .mmi
mov r8 = rTrialResult // Integer return value chosen in ILOGB_ZERO
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0
};;
LOCAL_LIBM_END(__libm_error_region)
// External error handler; only declared here.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1,70 +0,0 @@
/* file: lgamma_r.c */
// Copyright (c) 2002 Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//
// History
//==============================================================
// 02/04/02: Initial version
// 02/22/02: Removed lgammaf_r, gammaf_r
/*
// FUNCTIONS: double lgamma_r(double x, int* signgam)
// double gamma_r(double x, int* signgam)
// Natural logarithm of GAMMA function
*/
#include "libm_support.h"
/* Worker routine, declared here and not defined in this file.  The third
   argument receives the size in bytes of the object SIGNGAM points to.  */
extern double __libm_lgamma(double x, int* signgam, int signgamsz);

/* Reentrant double-precision log-gamma entry point: forwards X and the
   caller's sign pointer to __libm_lgamma and returns its result.  */
double __ieee754_lgamma_r(double x, int* signgam)
{
    const int sign_size = sizeof(*signgam);
    double log_gamma = __libm_lgamma(x, signgam, sign_size);

    return log_gamma;
}
libm_alias_double_r (__ieee754_lgamma, lgamma, _r)
#ifndef _LIBC
/* Only compiled when _LIBC is not defined: gamma_r performs exactly the same
   call as lgamma_r and is exported under the weak name gamma_r.  */
double __ieee754_gamma_r(double x, int* signgam)
{
    const int sign_size = sizeof(*signgam);
    double log_gamma = __libm_lgamma(x, signgam, sign_size);

    return log_gamma;
}
weak_alias (__ieee754_gamma_r, gamma_r)
#endif

View File

@ -1,70 +0,0 @@
/* file: lgammaf_r.c */
// Copyright (c) 2002 Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//
// History
//==============================================================
// 02/04/02: Initial version
// 02/22/02: Removed lgamma_r, gamma_r
/*
// FUNCTIONS: float lgammaf_r(float x, int* signgam)
// float gammaf_r(float x, int* signgam)
// Natural logarithm of GAMMA function
*/
#include "libm_support.h"
/* Worker routine, declared here and not defined in this file.  The third
   argument receives the size in bytes of the object SIGNGAM points to.  */
extern float __libm_lgammaf(float x, int* signgam, int signgamsz);

/* Reentrant single-precision log-gamma entry point: forwards X and the
   caller's sign pointer to __libm_lgammaf and returns its result.  */
float __ieee754_lgammaf_r(float x, int* signgam)
{
    const int sign_size = sizeof(*signgam);
    float log_gamma = __libm_lgammaf(x, signgam, sign_size);

    return log_gamma;
}
libm_alias_float_r (__ieee754_lgamma, lgamma, _r)
#ifndef _LIBC
/* Only compiled when _LIBC is not defined: gammaf_r performs exactly the same
   call as lgammaf_r and is exported under the weak name gammaf_r.  */
float __ieee754_gammaf_r(float x, int* signgam)
{
    const int sign_size = sizeof(*signgam);
    float log_gamma = __libm_lgammaf(x, signgam, sign_size);

    return log_gamma;
}
weak_alias (__ieee754_gammaf_r, gammaf_r)
#endif

View File

@ -1,69 +0,0 @@
/* file: lgammal_r.c */
// Copyright (c) 2002 Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//
// History
//==============================================================
// 08/15/02: Initial version
/*
// FUNCTIONS: long double lgammal_r(long double x, int* signgam)
// long double gammal_r(long double x, int* signgam)
// Natural logarithm of GAMMA function
*/
#include "libm_support.h"
/* Worker routine, declared here and not defined in this file.  The third
   argument receives the size in bytes of the object SIGNGAM points to.

   The return type is long double, matching the wrapper below and mirroring
   the float/double siblings, whose workers are declared with the wrapper's
   own type.  The previous declaration said `double', which makes the compiler
   treat the long double result as a double before widening it again.
   NOTE(review): confirm against the definition of __libm_lgammal and any
   other declaration of it in the included headers.  */
extern long double __libm_lgammal(long double /*x*/, int* /*signgam*/, int /*signgamsz*/);

/* Reentrant long double log-gamma entry point: forwards X and the caller's
   sign pointer to __libm_lgammal and returns its result unchanged.  */
long double __ieee754_lgammal_r(long double x, int* signgam)
{
    return __libm_lgammal(x, signgam, sizeof(*signgam));
}
libm_alias_ldouble_r (__ieee754_lgamma, lgamma, _r)
#ifndef _LIBC
/* Only compiled when _LIBC is not defined: gammal_r performs exactly the same
   call as lgammal_r and is exported under the weak name gammal_r.  */
long double __ieee754_gammal_r(long double x, int* signgam)
{
    const int sign_size = sizeof(*signgam);
    long double log_gamma = __libm_lgammal(x, signgam, sign_size);

    return log_gamma;
}
weak_alias (__ieee754_gammal_r, gammal_r)
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,715 +0,0 @@
.file "log2.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//=================================================================
// 09/11/00 Initial version
// 03/19/01 Added one polynomial coefficient, to improve accuracy
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 04/18/03 Reformatted T[255]
//
// API
//=================================================================
// double log2(double)
//
// Overview of operation
//=================================================================
// Background
//
// Implementation
//
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8 (table index)
// j=0 if f<128; j=1 if f>=128
// T is a table that stores log2(1/y) (in entries 1..255) rounded to
// double extended precision; f is used as an index; T[255]=0
//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
// and 0 is used instead of T[0]
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
// for m=2(1-r'), 0<=r'<2^{-9})
//
// log2(x) is approximated as
// (l+j) + T[f] + (c1*r+c2*r^2+...+c7*r^7), if f>0
//
// Special values
//=================================================================
// log2(0)=-inf, raises Divide by Zero
// log2(+inf)=inf
// log2(x)=NaN, raises Invalid if x<0
//
// Registers used
//==============================================================
// f6-f15, f32-f33
// r2-r3, r23-r30
// p6,p7,p8,p10,p12
//
// Symbolic register names for the log2 error path.
// log2 allocates its frame with "alloc r32=ar.pfs,1,4,4,0": r32 is the single
// input register, r33-r36 are locals and r37-r40 are outputs.
//
// Locals: values that must survive the call to __libm_error_support
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35 // This reg. can safely be used
// GR_SAVE_SP is defined but not referenced by the code in this file
GR_SAVE_SP = r36
// Outputs: the four arguments of __libm_error_support
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
// Floating-point values stored to the stack by __libm_error_region
FR_X = f10
FR_Y = f1
FR_RESULT = f8
// Data tables
//==============================================================
RODATA
.align 16
// poly_coeffs: polynomial coefficients read by log2 through r2/r3.
// Byte offsets as consumed by the code:
//    0: C_4, C_5   (two doubles, loaded as a pair)
//   16: C_6, C_7   (two doubles, loaded as a pair)
//   32: C_1        (16-byte entry: significand, then sign/exponent word)
//   48: C_3        (16-byte entry: significand, then sign/exponent word)
// The object is 64 bytes long; log2 steps its pointer past the end of this
// object to reach T_table, so T_table has to follow it immediately.
LOCAL_OBJECT_START(poly_coeffs)
data8 0xbfd0000000000000, 0x3fc999999999999a //C_4, C_5
data8 0xbfc5555555555555, 0x3fc2492492492492 //C_6, C_7
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // C_3=1/3
LOCAL_OBJECT_END(poly_coeffs)
// T_table: 256 entries of 16 bytes each (64-bit significand followed by a
// sign/exponent word). log2 indexes it with the 8 significand bits that follow
// the leading 1 (index * 16 added to the table base) and loads the entry
// with ldfe.
// The table must stay directly after poly_coeffs: log2 obtains the table base
// by advancing the poly_coeffs pointer, not from a separate symbol reference.
LOCAL_OBJECT_START(T_table)
data8 0xb8d8752172fed131, 0x00003ff6
data8 0x8ae7f475764180a3, 0x00003ff8
data8 0xe7f73862e72ee35d, 0x00003ff8
data8 0xa2b25310c941a2f2, 0x00003ff9
data8 0xcbb91d671abb2e85, 0x00003ff9
data8 0xfac91e34daa50483, 0x00003ff9
data8 0x9504a5042eb495c5, 0x00003ffa
data8 0xa9c4a0bbb580ee02, 0x00003ffa
data8 0xc19264dc8a5e3bf9, 0x00003ffa
data8 0xd67aa6703ebf4a77, 0x00003ffa
data8 0xee76cac6d6e08ce7, 0x00003ffa
data8 0x81c3f7de5434ed04, 0x00003ffb
data8 0x8c563033a3ce01e4, 0x00003ffb
data8 0x9876e9f09a98661c, 0x00003ffb
data8 0xa31e0ac9b2326ce2, 0x00003ffb
data8 0xadcf09e1fd10e4a5, 0x00003ffb
data8 0xb889f992cf03cdb6, 0x00003ffb
data8 0xc34eec68d901a714, 0x00003ffb
data8 0xce1df524e9909ed9, 0x00003ffb
data8 0xd8f726bcb0b80ad0, 0x00003ffb
data8 0xe3da945b878e27d1, 0x00003ffb
data8 0xeec851633b76a320, 0x00003ffb
data8 0xf82ea4bb6101421a, 0x00003ffb
data8 0x8197ddd7736b2864, 0x00003ffc
data8 0x871dad4f994253f0, 0x00003ffc
data8 0x8ca8cae3e892d549, 0x00003ffc
data8 0x916d6e1559a4b697, 0x00003ffc
data8 0x97028118efabeb7d, 0x00003ffc
data8 0x9bcfbce1592ad5d5, 0x00003ffc
data8 0xa16ee95d0da54a91, 0x00003ffc
data8 0xa644dcf3403fa5d0, 0x00003ffc
data8 0xab1ee14ffd659064, 0x00003ffc
data8 0xb0cd12faebcc6757, 0x00003ffc
data8 0xb5affdf9b3b221e0, 0x00003ffc
data8 0xba970fb307c6ade1, 0x00003ffc
data8 0xbf824f3a9f3e7561, 0x00003ffc
data8 0xc544c055fde99333, 0x00003ffc
data8 0xca39266532bdf26c, 0x00003ffc
data8 0xcf31d124b8fa2f56, 0x00003ffc
data8 0xd42ec7f59017b6ab, 0x00003ffc
data8 0xd930124bea9a2c67, 0x00003ffc
data8 0xde35b7af70e4dab3, 0x00003ffc
data8 0xe33fbfbb8533ef03, 0x00003ffc
data8 0xe77625911a7dcef3, 0x00003ffc
data8 0xec884bd689cc12e3, 0x00003ffc
data8 0xf19eeabf9e99a40a, 0x00003ffc
data8 0xf6ba0a35e3d88051, 0x00003ffc
data8 0xfbd9b237f7b4192b, 0x00003ffc
data8 0x80111d4a1ee0c79e, 0x00003ffd
data8 0x82a523a5f875bbfc, 0x00003ffd
data8 0x84ccecdc92cd0815, 0x00003ffd
data8 0x87653369d92c057a, 0x00003ffd
data8 0x89ffd1742da3aa21, 0x00003ffd
data8 0x8c2d2227d053d9b6, 0x00003ffd
data8 0x8e5c189793f7f798, 0x00003ffd
data8 0x90fd0a20e72f3c96, 0x00003ffd
data8 0x932fa937301e59ae, 0x00003ffd
data8 0x95d5061a5f0f5f7f, 0x00003ffd
data8 0x980b5a2ef10e7023, 0x00003ffd
data8 0x9a4361c5514d3c27, 0x00003ffd
data8 0x9c7d1f7d541313fd, 0x00003ffd
data8 0x9f2b16040b500d04, 0x00003ffd
data8 0xa168a0fa9db22c98, 0x00003ffd
data8 0xa3a7eaa1f9116293, 0x00003ffd
data8 0xa5e8f5b4072a3d44, 0x00003ffd
data8 0xa82bc4f11a5e88aa, 0x00003ffd
data8 0xaa705b2001db8317, 0x00003ffd
data8 0xacb6bb0e1e0f8005, 0x00003ffd
data8 0xaefee78f75707221, 0x00003ffd
data8 0xb148e37ec994dd99, 0x00003ffd
data8 0xb394b1bdaca0bc17, 0x00003ffd
data8 0xb5e255349707e496, 0x00003ffd
data8 0xb831d0d2fda791cc, 0x00003ffd
data8 0xba83278f6838ab20, 0x00003ffd
data8 0xbcd65c67881c7d47, 0x00003ffd
data8 0xbeb3e0f21d72dc92, 0x00003ffd
data8 0xc10a7a03457d35dc, 0x00003ffd
data8 0xc362f9b6f51eddd3, 0x00003ffd
data8 0xc5bd6326ebfce656, 0x00003ffd
data8 0xc7a0b3d0637c8f97, 0x00003ffd
data8 0xc9fe96af0df8e4b5, 0x00003ffd
data8 0xcc5e6c214b4a2cd7, 0x00003ffd
data8 0xce46199f374d29cf, 0x00003ffd
data8 0xd0a978a14c0d9ebe, 0x00003ffd
data8 0xd293fecafec7f9b5, 0x00003ffd
data8 0xd4faf1f6f5cf32e6, 0x00003ffd
data8 0xd6e8595abaad34d1, 0x00003ffd
data8 0xd952eb7a8ffc1593, 0x00003ffd
data8 0xdb433ccd805f171e, 0x00003ffd
data8 0xddb178dc43e6bd84, 0x00003ffd
data8 0xdfa4bcfb333342a4, 0x00003ffd
data8 0xe19953741ccea015, 0x00003ffd
data8 0xe40cee16a2ff21c5, 0x00003ffd
data8 0xe6048470cdbde8ea, 0x00003ffd
data8 0xe7fd7308d6895b14, 0x00003ffd
data8 0xe9f7bbb6a1ff9f87, 0x00003ffd
data8 0xec7280138809433d, 0x00003ffd
data8 0xee6fda4365cd051f, 0x00003ffd
data8 0xf06e94a122ff1f12, 0x00003ffd
data8 0xf26eb1151441fce5, 0x00003ffd
data8 0xf470318b88a77e2f, 0x00003ffd
data8 0xf67317f4d4c8aa58, 0x00003ffd
data8 0xf8f8b250a9c4cde6, 0x00003ffd
data8 0xfafec54831f1a484, 0x00003ffd
data8 0xfd06449bf3eaea1e, 0x00003ffd
data8 0xff0f324ddb19ab67, 0x00003ffd
data8 0x808cc8320a9acf15, 0x00003ffe
data8 0x8192b0748f2cef06, 0x00003ffe
data8 0x829952f5e6a24ee5, 0x00003ffe
data8 0x83a0b0bfafe1424e, 0x00003ffe
data8 0x8466b29f9c41caea, 0x00003ffe
data8 0x856f5aae0881d857, 0x00003ffe
data8 0x8678c0eae8ee8190, 0x00003ffe
data8 0x8782e6685676b9d7, 0x00003ffe
data8 0x888dcc3abc4554ec, 0x00003ffe
data8 0x89997378de7b98b8, 0x00003ffe
data8 0x8aa5dd3be1044279, 0x00003ffe
data8 0x8b6facdfd0360ab8, 0x00003ffe
data8 0x8c7d6db7169e0cdb, 0x00003ffe
data8 0x8d8bf424d6e130b2, 0x00003ffe
data8 0x8e575b506f409fa6, 0x00003ffe
data8 0x8f673e418776492c, 0x00003ffe
data8 0x9077e9ed700ef9ba, 0x00003ffe
data8 0x9144ef1baec80b20, 0x00003ffe
data8 0x9256fcdb537f035f, 0x00003ffe
data8 0x9369d68d75e7e1d6, 0x00003ffe
data8 0x943880613b8f9f1e, 0x00003ffe
data8 0x954cc1d9e0d94206, 0x00003ffe
// From index 128 onward the entries carry the sign bit in the exponent word,
// i.e. they are negative; log2 compensates by adding 1 to the exponent term.
data8 0xd3c70a37bdf7a294, 0x0000bffd
data8 0xd19bb053fb0284ec, 0x0000bffd
data8 0xcffa1a3b7dafb8bf, 0x0000bffd
data8 0xcdcbe1e2776479ee, 0x0000bffd
data8 0xcc282218b8bfdda2, 0x0000bffd
data8 0xc9f703a9afcb38ac, 0x0000bffd
data8 0xc851146ab89593c6, 0x0000bffd
data8 0xc61d08265927a860, 0x0000bffd
data8 0xc474e39705912d26, 0x0000bffd
data8 0xc23de19ec30c6e3e, 0x0000bffd
data8 0xc09381cc45db45b4, 0x0000bffd
data8 0xbee82b4e025ff90c, 0x0000bffd
data8 0xbcace101149788ec, 0x0000bffd
data8 0xbaff46962ea47964, 0x0000bffd
data8 0xb950b1be5e0c14a2, 0x0000bffd
data8 0xb7110e6ce866f2bc, 0x0000bffd
data8 0xb5602ccc2a81db52, 0x0000bffd
data8 0xb3ae4ce740fc8ef1, 0x0000bffd
data8 0xb1fb6d92c8240ccc, 0x0000bffd
data8 0xafb609c09b244abc, 0x0000bffd
data8 0xae00d1cfdeb43cfd, 0x0000bffd
data8 0xac4a967a8c8c9bd0, 0x0000bffd
data8 0xaa93568c249e6c52, 0x0000bffd
data8 0xa8db10cdff375343, 0x0000bffd
data8 0xa68e6fc5a42376e3, 0x0000bffd
data8 0xa4d3c25e68dc57f2, 0x0000bffd
data8 0xa3180b0c192a3816, 0x0000bffd
data8 0xa15b488e7aa329a0, 0x0000bffd
data8 0x9f9d79a30f0e1d5f, 0x0000bffd
data8 0x9dde9d050ee7d4ac, 0x0000bffd
data8 0x9c1eb16d63d7356c, 0x0000bffd
data8 0x9a5db592a310c36a, 0x0000bffd
data8 0x989ba82907a9016f, 0x0000bffd
data8 0x96d887e26cd57b79, 0x0000bffd
data8 0x9514536e481c3a4f, 0x0000bffd
data8 0x934f0979a3715fc9, 0x0000bffd
data8 0x9188a8af1742a9d5, 0x0000bffd
data8 0x8fc12fb6c470995f, 0x0000bffd
data8 0x8df89d364e34f8f1, 0x0000bffd
data8 0x8c2eefd0d3f67dd6, 0x0000bffd
data8 0x8a642626eb093d54, 0x0000bffd
data8 0x88983ed6985bae58, 0x0000bffd
data8 0x86cb387b4a0feec6, 0x0000bffd
data8 0x84fd11add101024b, 0x0000bffd
data8 0x83c856dd81804b78, 0x0000bffd
data8 0x81f84c2c62afd6f1, 0x0000bffd
data8 0x80271d3e4be5ea5a, 0x0000bffd
data8 0xfca991447e7b485d, 0x0000bffc
data8 0xf90299c904793a3c, 0x0000bffc
data8 0xf559511d2dc1ed69, 0x0000bffc
data8 0xf2e72afee9bd2aee, 0x0000bffc
data8 0xef39ff1d8a40770e, 0x0000bffc
data8 0xeb8a7a2311c935dc, 0x0000bffc
data8 0xe7d8990dc620012f, 0x0000bffc
data8 0xe560b1e3b86e44b6, 0x0000bffc
data8 0xe1aadb38caee80c4, 0x0000bffc
data8 0xddf2a051f81b76a4, 0x0000bffc
data8 0xdb7678bafcaf4b5f, 0x0000bffc
data8 0xd7ba3a8f0df19bfc, 0x0000bffc
data8 0xd3fb8fdbdd5cebdb, 0x0000bffc
data8 0xd17b191905c35652, 0x0000bffc
data8 0xcdb85d29cefd7121, 0x0000bffc
data8 0xc9f32c3c88221ef6, 0x0000bffc
data8 0xc76e5741a95b5dae, 0x0000bffc
data8 0xc3a506d80d38c718, 0x0000bffc
data8 0xbfd938ccef8b68c1, 0x0000bffc
data8 0xbd4ff63e82eef78c, 0x0000bffc
data8 0xb97ffa2b563865bd, 0x0000bffc
data8 0xb6f3eb3011eddcea, 0x0000bffc
data8 0xb31fb7d64898b3e6, 0x0000bffc
data8 0xb090d63a409e7880, 0x0000bffc
data8 0xacb8623c7ffa4f39, 0x0000bffc
data8 0xa8dd5c83d2e45246, 0x0000bffc
data8 0xa649e998a8d91f2e, 0x0000bffc
data8 0xa26a93fed6faa94f, 0x0000bffc
data8 0x9fd43df079d0db1f, 0x0000bffc
data8 0x9d3cbe69aecac4c2, 0x0000bffc
data8 0x99574f13c570d0fb, 0x0000bffc
data8 0x96bce349bf7ee6c7, 0x0000bffc
data8 0x92d30c9b86cee18e, 0x0000bffc
data8 0x9035adef17c5bd5c, 0x0000bffc
data8 0x8c4765e8e8b5f251, 0x0000bffc
data8 0x89a70da448316ffa, 0x0000bffc
data8 0x85b44a24474af78a, 0x0000bffc
data8 0x8310f17aab5adf70, 0x0000bffc
data8 0x806c6388d0965f29, 0x0000bffc
data8 0xf8e69092bf0c5ead, 0x0000bffb
data8 0xf397608bfd2d90e6, 0x0000bffb
data8 0xee45be24d0eedbc4, 0x0000bffb
data8 0xe646af233db881e9, 0x0000bffb
data8 0xe0eee4e1ce3d06fb, 0x0000bffb
data8 0xdb94a049e6e87a4f, 0x0000bffb
data8 0xd3888ef9a4249f5a, 0x0000bffb
data8 0xce280e6fbac39194, 0x0000bffb
data8 0xc8c50b72319ad574, 0x0000bffb
data8 0xc0abcd39f41e329b, 0x0000bffb
data8 0xbb4279cfa7f9667b, 0x0000bffb
data8 0xb5d69bac77ec398a, 0x0000bffb
data8 0xb068306bf20d6233, 0x0000bffb
data8 0xa83dc1b019ddb6a8, 0x0000bffb
data8 0xa2c8eb1886c2d024, 0x0000bffb
data8 0x9d517ee93f8e16c0, 0x0000bffb
data8 0x97d77aae659b92fb, 0x0000bffb
data8 0x8f9b91da5736d415, 0x0000bffb
data8 0x8a1b06b09b7fd1d1, 0x0000bffb
data8 0x8497daca0a2e077a, 0x0000bffb
data8 0xfe241745a453f10c, 0x0000bffa
data8 0xf3132d6708d723c5, 0x0000bffa
data8 0xe7fcf2e21a0e7d77, 0x0000bffa
data8 0xd75198b04afb8da9, 0x0000bffa
data8 0xcc2dfe1a4a8ca305, 0x0000bffa
data8 0xc10500d63aa65882, 0x0000bffa
data8 0xb5d69bac77ec398a, 0x0000bffa
data8 0xaaa2c95dc66abcde, 0x0000bffa
data8 0x9f6984a342d13101, 0x0000bffa
data8 0x942ac82e5387ac51, 0x0000bffa
data8 0x88e68ea899a0976c, 0x0000bffa
data8 0xefebc4409ccf872e, 0x0000bff9
data8 0xd947b0c6642ef69e, 0x0000bff9
data8 0xc2987d51e043d407, 0x0000bff9
data8 0xabde1eeee6bfd257, 0x0000bff9
data8 0x95188a9917cf2e01, 0x0000bff9
data8 0xfc8f6a777c1b7f1e, 0x0000bff8
data8 0xced727635c59725c, 0x0000bff8
data8 0xa108358a4c904615, 0x0000bff8
data8 0xe644fcbeb3ac9c90, 0x0000bff7
data8 0x8a4bd667bf08e7de, 0x0000bff7
// Last entry (index 255) is zero, written as two separate 8-byte words.
data8 0x0000000000000000 // T[255] Low
data8 0x0000000000000000 // T[255] High
LOCAL_OBJECT_END(T_table)
.section .text
// double log2(double x)
//   In:   f8 = x
//   Out:  f8 = log2(x)
//
//   Main path (x positive normal or unnormal):
//     result = l' + T[f] + (C_1*r) * P17(r)
//     l'     = unbiased exponent of x, plus 1 when the table index f >= 128
//     P17(r) = (1 - 0.5*r + C_3*r^2) + r^3*((C_4 + C_5*r) + r^2*(C_6 + C_7*r))
//   x == 1 returns +0 early; an exact power of two returns l' directly.
//
//   Special path (SPECIAL_LOG2): +inf is returned unchanged, +/-0 and x < 0
//   go through __libm_error_region with tags 170 and 171, NaNs are passed
//   through one arithmetic operation and returned.
//
//   The error path reads FR_X (original argument) and FR_RESULT (f8), so the
//   special path copies x into FR_X before f8 is overwritten with the result.
WEAK_LIBM_ENTRY(log2)
{ .mfi
alloc r32=ar.pfs,1,4,4,0
// y=frcpa(x)
frcpa.s1 f6,p0=f1,f8
// will form significand of 1.5 (to test whether the index is 128 or above)
mov r24=0xc
}
{.mfi
nop.m 0
// normalize x
fma.s1 f7=f8,f1,f0
// r2 = pointer to poly_coeffs (C_4..C_7, C_1, C_3), followed by T_table
addl r2 = @ltoff(poly_coeffs), gp;;
}
{.mfi
// get significand
getf.sig r25=f8
// f8 denormal ?
fclass.m p8,p10=f8,0x9
// will form significand of 1.5 (to test whether the index is 128 or above)
shl r24=r24,60
}
{.mfi
mov r26=0x804
nop.f 0
// r23=bias-1
mov r23=0xfffe;;
}
{.mmf
getf.exp r29=f8
// load start address for poly_coeffs followed by T_table
ld8 r2=[r2]
// will continue only for positive normal/denormal numbers
fclass.nm.unc p12,p7 = f8, 0x19 ;;
}
.pred.rel "mutex",p8,p10
{.mfi
// denormal input, repeat get significand (after normalization)
(p8) getf.sig r25=f7
// x=1 ?
fcmp.eq.s0 p6,p0=f8,f1
// get T_index
(p10) shr.u r28=r25,63-8
}
{.mfi
// f32=0.5
setf.exp f32=r23
nop.f 0
// r27=bias
mov r27=0xffff;;
}
{.mmi
// denormal input, repeat get exponent (after normalization)
(p8) getf.exp r29=f7
mov r23=0xff
// r26=0x80400...0 (threshold for using polynomial approximation)
shl r26=r26,64-12;;
}
{.mfb
// r3 = address of C_3 (poly_coeffs + 48)
add r3=48,r2
// r=x*y-1
fms.s1 f6=f6,f8,f1
(p12) br.cond.spnt SPECIAL_LOG2
}
{.mfi
// load C_4, C_5
ldfpd f10,f11=[r2],16
nop.f 0
// p12 = significand >= 1.5, i.e. table index >= 128
cmp.geu p12,p0=r25,r24;;
}
{.mmi
// load C_6, C_7
ldfpd f12,f13=[r2],16
// r27=bias-1 (if index >=128, will add exponent+1)
(p12) mov r27=0xfffe
(p8) shr.u r28=r25,63-8;;
}
{.mfi
// load C_1; the post-increment of 32 leaves r2 at the start of T_table
ldfe f14=[r2],32
// f7 = m: significand of x with the sign and exponent of 1.0
fmerge.se f7=f1,f7
// if first 9 bits after leading 1 are all zero, then p8=1
cmp.ltu p8,p12=r25,r26
}
{.mfi
// load C_3
ldfe f15=[r3]
nop.f 0
// get T_index
and r28=r28,r23;;
}
{.mfi
// r29=exponent-bias
sub r29=r29,r27
// x=1, return 0
(p6) fma.d.s0 f8=f0,f0,f0
// get T address
shladd r2=r28,4,r2
}
{.mfb
// first 8 bits after leading 1 are all ones ?
cmp.eq p10,p0=r23,r28
// if first 9 bits after leading bit are 0, use polynomial approx. only (r=m-1)
(p8) fms.s1 f6=f7,f1,f1
// x=1, return
(p6) br.ret.spnt b0;;
}
{.mfi
// r26=1
mov r26=1
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
// (r=m*0.5-1)
(p10) fms.s1 f6=f7,f32,f1
nop.i 0;;
}
.pred.rel "mutex",p8,p12
{.mmf
// load T (unless first 9 bits after leading 1 are 0)
(p12) ldfe f33=[r2]
// f8=expon - bias
setf.sig f8=r29
// set T=0 (if first 9 bits after leading 1 are 0)
(p8) fma.s1 f33=f0,f0,f0;;
}
{.mfi
nop.m 0
// P12=1-0.5*r
fnma.s1 f32=f32,f6,f1
// r26=2^{63}
shl r26=r26,63
}
{.mfi
nop.m 0
// r2=r*r
fma.s1 f7=f6,f6,f0
nop.i 0;;
}
{.mfi
// significand(x)=1 ? (p6 = significand is NOT exactly 1.0)
cmp.eq p0,p6=r26,r25
// P67=C_6+C_7*r
fma.s1 f13=f13,f6,f12
nop.i 0
}
{.mfi
nop.m 0
// P45=C_4+C_5*r
fma.s1 f10=f11,f6,f10
nop.i 0;;
}
{.mfi
nop.m 0
// C_1*r
(p6) fma.s1 f14=f14,f6,f0
nop.i 0;;
}
{.mfi
nop.m 0
// normalize additive term (l=exponent of x)
fcvt.xf f8=f8
nop.i 0
}
{.mfi
nop.m 0
// P13=1-0.5*r+C_3*r^2
(p6) fma.s1 f15=f15,f7,f32
nop.i 0;;
}
{.mfi
nop.m 0
// P47=P45+r2*P67
(p6) fma.s1 f13=f13,f7,f10
// if significand(x)=1, return exponent (l)
nop.i 0
}
{.mfi
nop.m 0
// r3=r^3
(p6) fma.s1 f7=f7,f6,f0
nop.i 0;;
}
{.mfi
nop.m 0
// add T+l
(p6) fma.s1 f8=f8,f1,f33
nop.i 0
}
{.mfi
nop.m 0
// P17=P13+r3*P47
(p6) fma.s1 f13=f13,f7,f15
nop.i 0;;
}
{.mfb
nop.m 0
// result=T+l+(C_1*r)*P17
(p6) fma.d.s0 f8=f13,f14,f8
// return
br.ret.sptk b0;;
}
SPECIAL_LOG2:
{.mfi
nop.m 0
// x=+Infinity ?
fclass.m p7,p0=f8,0x21
nop.i 0;;
}
{.mfi
nop.m 0
// x=+/-Zero ?
fclass.m p8,p0=f8,0x7
nop.i 0;;
}
{.mfi
nop.m 0
// x=-Infinity, -normal, -denormal ?
fclass.m p6,p0=f8,0x3a
nop.i 0;;
}
{.mfb
nop.m 0
// Keep the original argument in FR_X for __libm_error_region: f8 is
// overwritten with the result below, and FR_X is not written anywhere
// else on this path.
fmerge.s FR_X=f8,f8
// log2(+Infinity)=+Infinity
(p7) br.ret.spnt b0;;
}
{.mfi
(p8) mov GR_Parameter_TAG = 170
// log2(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
(p8) fmerge.ns f8=f0,f8
nop.i 0;;
}
{.mfb
nop.m 0
(p8) frcpa.s0 f8,p0=f1,f8
(p8) br.cond.sptk __libm_error_region;;
}
{.mfb
(p6) mov GR_Parameter_TAG = 171
// x<0: return NaN, raise Invalid
(p6) frcpa.s0 f8,p0=f0,f0
(p6) br.cond.sptk __libm_error_region;;
}
{.mfb
nop.m 0
// Remaining cases: NaNs
fma.d.s0 f8=f8,f1,f0
br.ret.sptk b0;;
}
WEAK_LIBM_END(log2)
libm_alias_double_other (__log2, log2)
#ifdef SHARED
.symver log2,log2@@GLIBC_2.29
.weak __log2_compat
.set __log2_compat,__log2
.symver __log2_compat,log2@GLIBC_2.2
#endif
// __libm_error_region (log2): reached by branch from SPECIAL_LOG2 with
// GR_Parameter_TAG set and the provisional result in FR_RESULT (f8).
// Builds a 64-byte frame, stores FR_X, FR_Y and FR_RESULT into it as doubles,
// passes their addresses plus the tag to __libm_error_support, then reloads
// the result slot into f8 and returns to the original caller.
//
// Frame layout relative to the new sp (sp_new = sp_old - 64):
//   sp_new+16 : FR_X       (address passed in GR_Parameter_X)
//   sp_new+32 : FR_Y       (address passed in GR_Parameter_Y)
//   sp_new+48 : FR_RESULT  (address passed in GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32 = new sp + 32)
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack, then advance pointer by 16
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
// Recompute the result-slot address from sp after the call.
{ .mmi
nop.m 0
nop.m 0
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// External error handler; only declared here.
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,553 +0,0 @@
.file "log2f.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 09/11/00 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
// float log2f(float)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8 (table index)
// j=0 if f<128; j=1 if f>=128
// T is a table that stores log2(1/y) (in entries 1..255) rounded to
// double extended precision; f is used as an index; T[255]=0
//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
// and 0 is used instead of T[0]
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
// for m=2(1-r'), 0<=r'<2^{-9})
//
// log2f(x) is approximated as
// (l+j) + T[f] + (c1*r+c2*r^2+c3*r^3+c4*r^4), if f>0
// (table entries with f>=128 are stored with j=1 already subtracted)
//
// Special values
//==============================================================
// log2f(0)=-inf, raises Divide by Zero
// log2f(+inf)=inf
// log2f(x)=NaN, raises Invalid if x<0
//
// Registers used
//==============================================================
// f6-f14
// r2-r3, r23-r29
// p6,p7,p8,p10,p12
//
// Symbolic register names used by log2f and its error exit.
// log2f allocates its frame with "alloc r32=ar.pfs,1,4,4,0", so r33-r36 are
// local registers and r37-r40 are output registers (the four arguments
// handed to __libm_error_support).
GR_SAVE_B0 = r33 // holds b0 (return address) across the error call
GR_SAVE_PFS = r34 // holds ar.pfs across the error call
GR_SAVE_GP = r35 // This reg. can safely be used
GR_SAVE_SP = r36 // not referenced by the code in this file
GR_Parameter_X = r37 // address of the stored first argument
GR_Parameter_Y = r38 // address of the stored second argument
GR_Parameter_RESULT = r39 // address of the stored result
GR_Parameter_TAG = r40 // error tag selecting the failure case
FR_X = f10 // value stored as parameter 1 by the error exit
FR_Y = f1 // value stored as parameter 2 (f1; log2f has a single argument)
FR_RESULT = f8 // function result / value stored as parameter 3
// Data tables
//==============================================================
RODATA
.align 16
// Polynomial coefficients for log2f.  The layout is relied upon by the code:
//   offset  0: C_3, C_4 as two doubles (loaded as a pair with ldfpd)
//   offset 16: C_1 as a 16-byte extended-format slot (loaded with ldfe)
//   offset 32: C_2 as a 16-byte extended-format slot (loaded with ldfe)
// T_table must follow immediately at offset 48: log2f reaches it by
// post-incrementing the poly_coeffs pointer past these entries.
LOCAL_OBJECT_START(poly_coeffs)
data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe //C_3 and C_4
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2
LOCAL_OBJECT_END(poly_coeffs)
LOCAL_OBJECT_START(T_table)
data8 0x3f671b0ea42e5fda, 0x3f815cfe8eaec830
data8 0x3f8cfee70c5ce5dc, 0x3f94564a62192834
data8 0x3f997723ace35766, 0x3f9f5923c69b54a1
data8 0x3fa2a094a085d693, 0x3fa538941776b01e
data8 0x3fa8324c9b914bc7, 0x3faacf54ce07d7e9
data8 0x3fadced958dadc12, 0x3fb0387efbca869e
data8 0x3fb18ac6067479c0, 0x3fb30edd3e13530d
data8 0x3fb463c15936464e, 0x3fb5b9e13c3fa21d
data8 0x3fb7113f3259e07a, 0x3fb869dd8d1b2035
data8 0x3fb9c3bea49d3214, 0x3fbb1ee4d7961701
data8 0x3fbc7b528b70f1c5, 0x3fbdd90a2c676ed4
data8 0x3fbf05d4976c2028, 0x3fc032fbbaee6d65
data8 0x3fc0e3b5a9f3284a, 0x3fc195195c7d125b
data8 0x3fc22dadc2ab3497, 0x3fc2e050231df57d
data8 0x3fc379f79c2b255b, 0x3fc42ddd2ba1b4a9
data8 0x3fc4c89b9e6807f5, 0x3fc563dc29ffacb2
data8 0x3fc619a25f5d798d, 0x3fc6b5ffbf367644
data8 0x3fc752e1f660f8d6, 0x3fc7f049e753e7cf
data8 0x3fc8a8980abfbd32, 0x3fc94724cca657be
data8 0x3fc9e63a24971f46, 0x3fca85d8feb202f7
data8 0x3fcb2602497d5346, 0x3fcbc6b6f5ee1c9b
data8 0x3fcc67f7f770a67e, 0x3fcceec4b2234fba
data8 0x3fcd91097ad13982, 0x3fce33dd57f3d335
data8 0x3fced74146bc7b10, 0x3fcf7b3646fef683
data8 0x3fd00223a943dc19, 0x3fd054a474bf0eb7
data8 0x3fd0999d9b9259a1, 0x3fd0eca66d3b2581
data8 0x3fd13ffa2e85b475, 0x3fd185a444fa0a7b
data8 0x3fd1cb8312f27eff, 0x3fd21fa1441ce5e8
data8 0x3fd265f526e603cb, 0x3fd2baa0c34be1ec
data8 0x3fd3016b45de21ce, 0x3fd3486c38aa29a8
data8 0x3fd38fa3efaa8262, 0x3fd3e562c0816a02
data8 0x3fd42d141f53b646, 0x3fd474fd543f222c
data8 0x3fd4bd1eb680e548, 0x3fd505789e234bd1
data8 0x3fd54e0b64003b70, 0x3fd596d761c3c1f0
data8 0x3fd5dfdcf1eeae0e, 0x3fd6291c6fd9329c
data8 0x3fd6729637b59418, 0x3fd6bc4aa692e0fd
data8 0x3fd7063a1a5fb4f2, 0x3fd75064f1ed0715
data8 0x3fd79acb8cf10390, 0x3fd7d67c1e43ae5c
data8 0x3fd8214f4068afa7, 0x3fd86c5f36dea3dc
data8 0x3fd8b7ac64dd7f9d, 0x3fd8f4167a0c6f92
data8 0x3fd93fd2d5e1bf1d, 0x3fd98bcd84296946
data8 0x3fd9c8c333e6e9a5, 0x3fda152f142981b4
data8 0x3fda527fd95fd8ff, 0x3fda9f5e3edeb9e6
data8 0x3fdadd0b2b5755a7, 0x3fdb2a5d6f51ff83
data8 0x3fdb686799b00be3, 0x3fdbb62f1b887cd8
data8 0x3fdbf4979f666668, 0x3fdc332a6e8399d4
data8 0x3fdc819dc2d45fe4, 0x3fdcc0908e19b7bd
data8 0x3fdcffae611ad12b, 0x3fdd3ef776d43ff4
data8 0x3fdd8e5002710128, 0x3fddcdfb486cb9a1
data8 0x3fde0dd294245fe4, 0x3fde4dd622a28840
data8 0x3fde8e06317114f0, 0x3fdece62fe9a9915
data8 0x3fdf1f164a15389a, 0x3fdf5fd8a9063e35
data8 0x3fdfa0c8937e7d5d, 0x3fdfe1e649bb6335
data8 0x3fe011990641535a, 0x3fe032560e91e59e
data8 0x3fe0532a5ebcd44a, 0x3fe0741617f5fc28
data8 0x3fe08cd653f38839, 0x3fe0adeb55c1103b
data8 0x3fe0cf181d5d1dd0, 0x3fe0f05ccd0aced7
data8 0x3fe111b9875788ab, 0x3fe1332e6f1bcf73
data8 0x3fe154bba77c2088, 0x3fe16df59bfa06c1
data8 0x3fe18fadb6e2d3c2, 0x3fe1b17e849adc26
data8 0x3fe1caeb6a0de814, 0x3fe1ece7c830eec9
data8 0x3fe20efd3dae01df, 0x3fe2289de375d901
data8 0x3fe24adf9b6a6fe0, 0x3fe26d3ad1aebcfc
data8 0x3fe287100c2771f4, 0x3fe2a9983b3c1b28
data8 0xbfda78e146f7bef4, 0xbfda33760a7f6051
data8 0xbfd9ff43476fb5f7, 0xbfd9b97c3c4eec8f
data8 0xbfd98504431717fc, 0xbfd93ee07535f967
data8 0xbfd90a228d5712b2, 0xbfd8c3a104cb24f5
data8 0xbfd88e9c72e0b226, 0xbfd847bc33d8618e
data8 0xbfd812703988bb69, 0xbfd7dd0569c04bff
data8 0xbfd7959c202292f1, 0xbfd75fe8d2c5d48f
data8 0xbfd72a1637cbc183, 0xbfd6e221cd9d0cde
data8 0xbfd6ac059985503b, 0xbfd675c99ce81f92
data8 0xbfd63f6db2590482, 0xbfd5f6c138136489
data8 0xbfd5c01a39fbd688, 0xbfd58952cf519193
data8 0xbfd5526ad18493ce, 0xbfd51b6219bfe6ea
data8 0xbfd4d1cdf8b4846f, 0xbfd49a784bcd1b8b
data8 0xbfd4630161832547, 0xbfd42b6911cf5465
data8 0xbfd3f3af3461e1c4, 0xbfd3bbd3a0a1dcfb
data8 0xbfd383d62dac7ae7, 0xbfd34bb6b2546218
data8 0xbfd313750520f520, 0xbfd2db10fc4d9aaf
data8 0xbfd2a28a6dc90387, 0xbfd269e12f346e2c
data8 0xbfd2311515e2e855, 0xbfd1f825f6d88e13
data8 0xbfd1bf13a6c9c69f, 0xbfd185ddfa1a7ed0
data8 0xbfd14c84c4dd6128, 0xbfd11307dad30b76
data8 0xbfd0d9670f6941fe, 0xbfd09fa235ba2020
data8 0xbfd0790adbb03009, 0xbfd03f09858c55fb
data8 0xbfd004e3a7c97cbd, 0xbfcf9532288fcf69
data8 0xbfcf205339208f27, 0xbfceab2a23a5b83e
data8 0xbfce5ce55fdd37a5, 0xbfcde73fe3b1480f
data8 0xbfcd714f44623927, 0xbfccfb1321b8c400
data8 0xbfccac163c770dc9, 0xbfcc355b67195dd0
data8 0xbfcbbe540a3f036f, 0xbfcb6ecf175f95e9
data8 0xbfcaf74751e1be33, 0xbfca7f71fb7bab9d
data8 0xbfca2f632320b86b, 0xbfc9b70ba539dfae
data8 0xbfc93e6587910444, 0xbfc8edcae8352b6c
data8 0xbfc874a0db01a719, 0xbfc7fb27199df16d
data8 0xbfc7a9fec7d05ddf, 0xbfc72fff456ac70d
data8 0xbfc6de7d66023dbc, 0xbfc663f6fac91316
data8 0xbfc6121ac74813cf, 0xbfc5970c478fff4a
data8 0xbfc51bab907a5c8a, 0xbfc4c93d33151b24
data8 0xbfc44d527fdadf55, 0xbfc3fa87be0f3a1b
data8 0xbfc3a797cd35d959, 0xbfc32ae9e278ae1a
data8 0xbfc2d79c6937efdd, 0xbfc25a619370d9dc
data8 0xbfc206b5bde2f8b8, 0xbfc188ecbd1d16be
data8 0xbfc134e1b489062e, 0xbfc0b6894488e95f
data8 0xbfc0621e2f556b5c, 0xbfc00d8c711a12cc
data8 0xbfbf1cd21257e18c, 0xbfbe72ec117fa5b2
data8 0xbfbdc8b7c49a1ddb, 0xbfbcc8d5e467b710
data8 0xbfbc1ddc9c39c7a1, 0xbfbb7294093cdd0f
data8 0xbfba7111df348494, 0xbfb9c501cdf75872
data8 0xbfb918a16e46335b, 0xbfb81579a73e83c6
data8 0xbfb7684f39f4ff2d, 0xbfb6bad3758efd87
data8 0xbfb60d060d7e41ac, 0xbfb507b836033bb7
data8 0xbfb4591d6310d85a, 0xbfb3aa2fdd27f1c3
data8 0xbfb2faef55ccb372, 0xbfb1f3723b4ae6db
data8 0xbfb14360d6136ffa, 0xbfb092fb594145c1
data8 0xbfafc482e8b48a7e, 0xbfae6265ace11ae4
data8 0xbfacff9e5c4341d0, 0xbfaaea3316095f72
data8 0xbfa985bfc3495194, 0xbfa820a01ac754cb
data8 0xbfa6bad3758efd87, 0xbfa554592bb8cd58
data8 0xbfa3ed3094685a26, 0xbfa2855905ca70f6
data8 0xbfa11cd1d5133413, 0xbf9dfd78881399f1
data8 0xbf9b28f618cc85df, 0xbf98530faa3c087b
data8 0xbf957bc3dddcd7fa, 0xbf92a3115322f9e6
data8 0xbf8f91ed4eef8370, 0xbf89dae4ec6b8b2e
data8 0xbf842106b1499209, 0xbf7cc89f97d67594
data8 0xbf71497accf7e11d, 0x0000000000000000
LOCAL_OBJECT_END(T_table)
.section .text
// float log2f(float x)
//   In:  f8 = x
//   Out: f8 = log2(x), rounded to single precision
//
// Main path (x positive normal/denormal):
//   y = frcpa(x), r = x*y-1, index = first 8 fraction bits of the significand,
//   result = (exponent term) + T[index] + r*(C_1 + C_2*r + r^2*(C_3 + C_4*r)).
//   When the first 9 fraction bits are all 0, or the first 8 are all 1, the
//   table term is dropped/zero and r is derived from the significand directly.
// Special path (SPECIAL_log2f): +Inf is returned unchanged, +/-0 and negative
//   inputs go through __libm_error_region (tags 172 and 173), NaNs are
//   returned after a quieting fma.
// The original argument is copied to FR_X on the special path so that the
// error exit passes x (and not stale register contents) as parameter 1.
WEAK_LIBM_ENTRY(log2f)
{ .mfi
// 1 input, 4 locals (r33-r36), 4 outputs (r37-r40)
alloc r32=ar.pfs,1,4,4,0
// y=frcpa(x)
frcpa.s1 f6,p0=f1,f8
// will form significand of 1.5 (to test whether the index is 128 or above)
mov r24=0xc
}
{.mfi
nop.m 0
// normalize x
fma.s1 f7=f8,f1,f0
// r2 = linkage table slot for poly_coeffs (C_1...C_4 followed by T_table)
addl r2 = @ltoff(poly_coeffs), gp;;
}
{.mfi
// get significand
getf.sig r25=f8
// f8 denormal ?
fclass.m p8,p10=f8,0x9
// will form significand of 1.5 (to test whether the index is 128 or above)
shl r24=r24,60
}
{.mfi
mov r26=0x804
nop.f 0
// r23=bias-1
mov r23=0xfffe;;
}
{.mmf
getf.exp r29=f8
// load start address for C_1...C_4 followed by T_table
ld8 r2=[r2]
// will continue only for positive normal/denormal numbers
fclass.nm.unc p12,p7 = f8, 0x19 ;;
}
.pred.rel "mutex",p8,p10
{.mfi
// denormal input, repeat get significand (after normalization)
(p8) getf.sig r25=f7
// x=1 ?
fcmp.eq.s0 p6,p0=f8,f1
// get T_index
(p10) shr.u r28=r25,63-8
}
{.mfi
// f12=0.5
setf.exp f12=r23
nop.f 0
// r27=bias
mov r27=0xffff;;
}
{.mfb
// denormal input, repeat get exponent (after normalization)
(p8) getf.exp r29=f7
nop.f 0
// zero, negative, infinite or NaN input
(p12) br.cond.spnt SPECIAL_log2f
}
{.mfi
// p12 is reused from here on: significand >= 1.5, i.e. index >= 128
cmp.geu p12,p0=r25,r24
nop.f 0
mov r23=0xff;;
}
{.mfi
// r3 = address of C_2
add r3=32,r2
// r=x*y-1
fms.s1 f6=f6,f8,f1
// r26=0x80400...0 (threshold for using polynomial approximation)
shl r26=r26,64-12
}
{.mfi
// load C_3, C_4
ldfpd f10,f11=[r2],16
nop.f 0
// r27=bias-1 (if index >=128, will add exponent+1)
(p12) mov r27=0xfffe;;
}
{.mfi
// load C_1 (post-increment leaves r2 at the start of T_table)
ldfe f14=[r2],32
// x=1, return 0
(p6) fma.s.s0 f8=f0,f0,f0
(p8) shr.u r28=r25,63-8
}
{.mib
// load C_2
ldfe f13=[r3]
// r29=exponent-bias
sub r29=r29,r27
// x=1, return
(p6) br.ret.spnt b0;;
}
{.mfi
// get T_index
and r28=r28,r23
// f7 = significand of x scaled into [1,2)
fmerge.se f7=f1,f7
// if first 9 bits after leading 1 are all zero, then p8=1
cmp.ltu p8,p12=r25,r26;;
}
{.mfi
// f8=expon - bias
setf.sig f8=r29
nop.f 0
// get T address
shladd r2=r28,3,r2
}
{.mfi
// first 8 bits after leading 1 are all ones ?
cmp.eq p10,p0=r23,r28
// if first 8 bits after leading bit are 0, use polynomial approx. only
(p8) fms.s1 f6=f7,f1,f1
nop.i 0;;
}
{.mfi
//r26=1
mov r26=1
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
(p10) fms.s1 f6=f7,f12,f1
nop.i 0;;
}
.pred.rel "mutex",p8,p12
{.mmf
// load T (unless first 9 bits after leading 1 are 0)
(p12) ldfd f12=[r2]
nop.m 0
// set T=0 (if first 9 bits after leading 1 are 0)
(p8) fma.s1 f12=f0,f0,f0;;
}
{.mfi
nop.m 0
// P34=C_3+C_4*r
fma.s1 f10=f11,f6,f10
// r26=2^{63}
shl r26=r26,63
}
{.mfi
nop.m 0
// r2=r*r
fma.s1 f11=f6,f6,f0
nop.i 0;;
}
{.mfi
// significand of x is 1 ?
cmp.eq p0,p6=r25,r26
// P12=C_1+C_2*r
fma.s1 f14=f13,f6,f14
nop.i 0;;
}
{.mfi
nop.m 0
// normalize additive term (l=exponent of x)
fcvt.xf f8=f8
// if significand(x)=1, return exponent (l)
nop.i 0;;
}
{.mfi
nop.m 0
// add T+l
(p6) fma.s1 f8=f8,f1,f12
nop.i 0
}
{.mfi
nop.m 0
// P14=P12+r2*P34
(p6) fma.s1 f13=f10,f11,f14
nop.i 0;;
}
{.mfb
nop.m 0
// result=T+l+r*P14
(p6) fma.s.s0 f8=f13,f6,f8
// return
br.ret.sptk b0;;
}
SPECIAL_log2f:
{.mfi
nop.m 0
// keep the original argument for the error exit: __libm_error_region
// stores FR_X as parameter 1, and f8 is overwritten below
mov FR_X=f8
nop.i 0
}
{.mfi
nop.m 0
// x=+Infinity ?
fclass.m p7,p0=f8,0x21
nop.i 0;;
}
{.mfi
nop.m 0
// x=+/-Zero ?
fclass.m p8,p0=f8,0x7
nop.i 0;;
}
{.mfi
nop.m 0
// x=-Infinity, -normal, -denormal ?
fclass.m p6,p0=f8,0x3a
nop.i 0;;
}
{.mfb
nop.m 0
// log2f(+Infinity)=+Infinity
nop.f 0
(p7) br.ret.spnt b0;;
}
{.mfi
(p8) mov GR_Parameter_TAG = 172
// log2f(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
(p8) fmerge.ns f8=f0,f8
nop.i 0;;
}
{.mfb
nop.m 0
// 1/(-0): produces -infinity and raises Divide by Zero
(p8) frcpa.s0 f8,p0=f1,f8
(p8) br.cond.sptk __libm_error_region;;
}
{.mfb
(p6) mov GR_Parameter_TAG = 173
// x<0: return NaN, raise Invalid
(p6) frcpa.s0 f8,p0=f0,f0
(p6) br.cond.sptk __libm_error_region;;
}
{.mfb
nop.m 0
// Remaining cases: NaNs
fma.s.s0 f8=f8,f1,f0
br.ret.sptk b0;;
}
WEAK_LIBM_END(log2f)
// NOTE(review): libm_alias_float_other is defined outside this file; it
// presumably emits the additional public aliases for the float variant of
// __log2 -- confirm against the libm alias headers.
libm_alias_float_other (__log2, log2)
#ifdef SHARED
// Shared library only: log2f is exported as the default version GLIBC_2.27,
// and a weak alias of __log2f is bound to the older version node GLIBC_2.2.
.symver log2f,log2f@@GLIBC_2.27
.weak __log2f_compat
.set __log2f_compat,__log2f
.symver __log2f_compat,log2f@GLIBC_2.2
#endif
// __libm_error_region: error exit for log2f, reached by branch from
// SPECIAL_log2f with GR_Parameter_TAG already set (172 or 173).
// Builds a 64-byte stack frame, spills FR_X, FR_Y and FR_RESULT into it as
// single precision values (stfs), and calls __libm_error_support with the
// slot addresses in GR_Parameter_X, GR_Parameter_Y and GR_Parameter_RESULT
// (output registers r37-r39) and the tag in GR_Parameter_TAG (r40).
// On return the result slot is reloaded into f8 and control returns to
// log2f's caller via the saved b0.
//
// Frame layout relative to the new sp:
//   sp+16  parameter 1 (FR_X)
//   sp+32  parameter 2 (FR_Y)
//   sp+48  parameter 3 / result (FR_RESULT)
// The lowest 16 bytes of the frame are not written here.
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp-32 == new sp+32)
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// post-increment leaves GR_Parameter_Y at sp+48 (the result slot)
stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y // back to the parameter 2 slot (sp+32)
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// recompute the result slot address after the call
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// the error handler is defined outside this file
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

View File

@ -1,815 +0,0 @@
.file "log2l.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 09/25/00 Initial version
// 11/22/00 Fixed accuracy bug (for mantissas near 1, 2)
// 12/07/00 Fixed C_1l constant, eliminated rounding errors in
// reduced argument (x*frcpa(x)-1)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
// long double log2l(long double)
//
// Overview of operation
//==============================================================
// Background
//
// Implementation
//
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8
// T_hi is a table that stores the 24 most significant bits of log2(1/y)
// (in entries 1..255) in single precision format
// T_low is a table that stores (log2(1/y)-T_high), rounded to double
// precision
//
// f is used as an index; T_high[255]=T_low[255]=0
//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
// and 0 is used instead of T_high[0], T_low[0]
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
// for m=2(1-r'), 0<=r'<2^{-9})
//
// If 2^{-9}<=m<2-2^{-8} or (input not near 1), let C1r=(2^{16}+C1*r)-2^{16}
// and let E=((RN(m*y)-1)-r)+(m*y-RN(m*y))
// Else let C1r=C1*r (rounded to 64 significant bits) and let E=0
//
// Let D=C1*r-C1r
//
//
// log2l(x) is approximated as
// (l+T_high[f]+C1r) + (D+r*(c1+c2*r+c3*r^2...+c8*r^7)+(T_low[f]+C_1*E))
//
// Special values
//==============================================================
// log2l(0)=-inf, raises Divide by Zero
// log2l(+inf)=inf
// log2l(x)=NaN, raises Invalid if x<0
//
// Registers used
//==============================================================
// f6-f15, f32-f36, f41-f43
// r2-r3, r23-r29
// p6,p7,p8,p10,p12,p13
//
// Symbolic register names used by log2l and its error exit.
// log2l allocates its frame with "alloc r32=ar.pfs,1,4,4,0", so r33-r36 are
// local registers and r37-r40 are output registers (the four arguments
// handed to the error handler).
GR_SAVE_B0 = r33 // holds b0 (return address) across the error call
GR_SAVE_PFS = r34 // holds ar.pfs across the error call
GR_SAVE_GP = r35 // This reg. can safely be used
GR_SAVE_SP = r36 // not referenced by the visible log2l code
GR_Parameter_X = r37 // address of the stored first argument
GR_Parameter_Y = r38 // address of the stored second argument
GR_Parameter_RESULT = r39 // address of the stored result
GR_Parameter_TAG = r40 // error tag selecting the failure case
FR_X = f10 // copy of the original argument on the error paths
FR_Y = f1 // value stored as parameter 2 (f1; log2l has a single argument)
FR_RESULT = f8 // function result / value stored as parameter 3
// Data tables
//==============================================================
RODATA
.align 16
// Polynomial coefficients for log2l.  The layout is relied upon by the code:
//   offset  0: C_1  (16-byte extended-format slot, ldfe)
//   offset 16: C_7, C_8 (two doubles, ldfpd)
//   offset 32: C_5, C_6 (two doubles, ldfpd)
//   offset 48: C_3, C_4 (two doubles, ldfpd)
//   offset 64: C_1l (16-byte extended-format slot, ldfe)
//   offset 80: C_2  (16-byte extended-format slot, ldfe)
// T_table must follow immediately at offset 96, and T_low 1024 bytes after
// that: log2l derives both addresses from the poly_coeffs pointer.
LOCAL_OBJECT_START(poly_coeffs)
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
data8 0x3fca61762a7aded9, 0xbfc71547652b82fe // C_7, C_8
data8 0x3fd2776c50ef9bfe, 0xbfcec709dc3a03fd // C_5, C_6
data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe // C_3, C_4
// superseded value of C_1l, kept for reference (not assembled):
//data8 0xd871319ff0342580, 0x0000bfbd // C_1l (low part of C1)
data8 0x82f0025f2dc582ee, 0x0000bfbe // C_1l (low part of C1)
data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2
LOCAL_OBJECT_END(poly_coeffs)
LOCAL_OBJECT_START(T_table)
data4 0x3b38d875, 0x3c0ae7f4, 0x3c67f738, 0x3ca2b253
data4 0x3ccbb91d, 0x3cfac91e, 0x3d1504a5, 0x3d29c4a0
data4 0x3d419264, 0x3d567aa6, 0x3d6e76ca, 0x3d81c3f7
data4 0x3d8c5630, 0x3d9876e9, 0x3da31e0a, 0x3dadcf09
data4 0x3db889f9, 0x3dc34eec, 0x3dce1df5, 0x3dd8f726
data4 0x3de3da94, 0x3deec851, 0x3df82ea4, 0x3e0197dd
data4 0x3e071dad, 0x3e0ca8ca, 0x3e116d6e, 0x3e170281
data4 0x3e1bcfbc, 0x3e216ee9, 0x3e2644dc, 0x3e2b1ee1
data4 0x3e30cd12, 0x3e35affd, 0x3e3a970f, 0x3e3f824f
data4 0x3e4544c0, 0x3e4a3926, 0x3e4f31d1, 0x3e542ec7
data4 0x3e593012, 0x3e5e35b7, 0x3e633fbf, 0x3e677625
data4 0x3e6c884b, 0x3e719eea, 0x3e76ba0a, 0x3e7bd9b2
data4 0x3e80111d, 0x3e82a523, 0x3e84ccec, 0x3e876533
data4 0x3e89ffd1, 0x3e8c2d22, 0x3e8e5c18, 0x3e90fd0a
data4 0x3e932fa9, 0x3e95d506, 0x3e980b5a, 0x3e9a4361
data4 0x3e9c7d1f, 0x3e9f2b16, 0x3ea168a0, 0x3ea3a7ea
data4 0x3ea5e8f5, 0x3ea82bc4, 0x3eaa705b, 0x3eacb6bb
data4 0x3eaefee7, 0x3eb148e3, 0x3eb394b1, 0x3eb5e255
data4 0x3eb831d0, 0x3eba8327, 0x3ebcd65c, 0x3ebeb3e0
data4 0x3ec10a7a, 0x3ec362f9, 0x3ec5bd63, 0x3ec7a0b3
data4 0x3ec9fe96, 0x3ecc5e6c, 0x3ece4619, 0x3ed0a978
data4 0x3ed293fe, 0x3ed4faf1, 0x3ed6e859, 0x3ed952eb
data4 0x3edb433c, 0x3eddb178, 0x3edfa4bc, 0x3ee19953
data4 0x3ee40cee, 0x3ee60484, 0x3ee7fd73, 0x3ee9f7bb
data4 0x3eec7280, 0x3eee6fda, 0x3ef06e94, 0x3ef26eb1
data4 0x3ef47031, 0x3ef67317, 0x3ef8f8b2, 0x3efafec5
data4 0x3efd0644, 0x3eff0f32, 0x3f008cc8, 0x3f0192b0
data4 0x3f029952, 0x3f03a0b0, 0x3f0466b2, 0x3f056f5a
data4 0x3f0678c0, 0x3f0782e6, 0x3f088dcc, 0x3f099973
data4 0x3f0aa5dd, 0x3f0b6fac, 0x3f0c7d6d, 0x3f0d8bf4
data4 0x3f0e575b, 0x3f0f673e, 0x3f1077e9, 0x3f1144ef
data4 0x3f1256fc, 0x3f1369d6, 0x3f143880, 0x3f154cc1
data4 0x3f161c7a, 0x3f173227, 0x3f1802f2, 0x3f191a0f
data4 0x3f19ebee, 0x3f1b047e, 0x3f1bd775, 0x3f1cf17b
data4 0x3f1dc58e, 0x3f1ee10f, 0x3f1fb63f, 0x3f208bea
data4 0x3f21a98f, 0x3f22805c, 0x3f2357a7, 0x3f247778
data4 0x3f254fe9, 0x3f2628d9, 0x3f270249, 0x3f2824fb
data4 0x3f28ff97, 0x3f29dab4, 0x3f2ab654, 0x3f2b9277
data4 0x3f2cb8c8, 0x3f2d961e, 0x3f2e73fa, 0x3f2f525b
data4 0x3f303143, 0x3f3110b1, 0x3f31f0a7, 0x3f32d125
data4 0x3f33b22b, 0x3f3493bc, 0x3f3575d6, 0x3f36587b
data4 0x3f373bab, 0x3f381f68, 0x3f3903b1, 0x3f39e888
data4 0x3f3acdec, 0x3f3bb3e0, 0x3f3c9a63, 0x3f3d8177
data4 0x3f3e1bd4, 0x3f3f03d9, 0x3f3fec71, 0x3f40d59b
data4 0x3f41bf59, 0x3f42a9ab, 0x3f434635, 0x3f443180
data4 0x3f451d61, 0x3f4609d9, 0x3f46a7d3, 0x3f479549
data4 0x3f488357, 0x3f492261, 0x3f4a1171, 0x3f4b011c
data4 0x3f4ba139, 0x3f4c91e8, 0x3f4d8334, 0x3f4e246a
data4 0x3f4f16be, 0x3f5009b1, 0x3f50ac02, 0x3f51a001
data4 0x3f524305, 0x3f533812, 0x3f53dbca, 0x3f54d1e7
data4 0x3f55c8a8, 0x3f566d85, 0x3f57655b, 0x3f580af0
data4 0x3f58b0d0, 0x3f59aa2c, 0x3f5a50c7, 0x3f5b4b3c
data4 0x3f5bf294, 0x3f5cee26, 0x3f5d963c, 0x3f5e92ed
data4 0x3f5f3bc3, 0x3f5fe4e7, 0x3f60e32d, 0x3f618d13
data4 0x3f623748, 0x3f63372a, 0x3f63e223, 0x3f648d6b
data4 0x3f658eee, 0x3f663afe, 0x3f66e75e, 0x3f67ea86
data4 0x3f6897b0, 0x3f69452c, 0x3f69f2f9, 0x3f6af847
data4 0x3f6ba6e2, 0x3f6c55d0, 0x3f6d0510, 0x3f6e0c8d
data4 0x3f6ebc9f, 0x3f6f6d04, 0x3f701dbe, 0x3f70cecd
data4 0x3f718030, 0x3f728ae6, 0x3f733d20, 0x3f73efaf
data4 0x3f74a296, 0x3f7555d3, 0x3f760967, 0x3f76bd53
data4 0x3f777197, 0x3f7880a1, 0x3f7935c2, 0x3f79eb3c
data4 0x3f7aa10f, 0x3f7b573b, 0x3f7c0dc2, 0x3f7cc4a3
data4 0x3f7d7bdf, 0x3f7e3376, 0x3f7eeb68, 0x00000000
LOCAL_OBJECT_END(T_table)
LOCAL_OBJECT_START(T_low)
data8 0x3dc0b97f689876ef, 0x3dfd5d906028ac01
data8 0x3df8b9cbb8d7240b, 0x3de0c941a2f220cd
data8 0x3e09c6aecba15936, 0x3dfa6d528241827c
data8 0x3dd0bad25714903c, 0x3e2776b01dc036a2
data8 0x3e2b914bc77f158b, 0x3e1c0fafd29dc74a
data8 0x3e28dadc119cd3de, 0x3e3bca869da085be
data8 0x3e19d1e700f2200a, 0x3e3e13530cc37504
data8 0x3e3936464d9c41ee, 0x3e3c3fa21c9499d0
data8 0x3e3259e079b6c6e8, 0x3e2a364069c4f7f3
data8 0x3e1274c84f6c6364, 0x3e3796170159f454
data8 0x3e26e1e389f4364e, 0x3e28cedda8c7f658
data8 0x3e376c2028433268, 0x3e4aee6d650c82e1
data8 0x3e33e65094fbeeb4, 0x3e4c7d125aa92c5d
data8 0x3e1559a4b69691d8, 0x3e18efabeb7d7221
data8 0x3e4c2b255abaa8de, 0x3e37436952a4538b
data8 0x3e4e6807f4ba00b8, 0x3e33ff5964190e42
data8 0x3e4f5d798cead43c, 0x3e4f3676443bf453
data8 0x3e4660f8d5bc1bf5, 0x3e2d4f9f3ab04f36
data8 0x3e357f7a64ccd537, 0x3e394caf7c9b05af
data8 0x3e225c7d17ab29b0, 0x3e4eb202f6d55a12
data8 0x3e32faa68b19bcd2, 0x3e45ee1c9b566a8b
data8 0x3e4770a67de054ff, 0x3e42234fb9de6d6b
data8 0x3e4ad139825c6e19, 0x3e47f3d334814a93
data8 0x3e2af1ec402867b6, 0x3e2bfbda0c956e3d
data8 0x3e4287b831e77ff2, 0x3e54bf0eb77f7b89
data8 0x3e5b9259a1029607, 0x3e4a764b015e699d
data8 0x3e4d0b68ea883ab5, 0x3e33e829ecdadf46
data8 0x3e52f27efef3031b, 0x3e3073979e4af89e
data8 0x3e3b980f2cd6c253, 0x3e2a5f0f5f7f66a9
data8 0x3e37788738117b02, 0x3e58aa29a784d52f
data8 0x3e4f5504c4ff2466, 0x3e002d40340fa647
data8 0x3e5f53b64592f4c3, 0x3e543f222c526802
data8 0x3e5680e547a872fa, 0x3e5e234bd1154450
data8 0x3e3000edc18b6d21, 0x3e1c3c1f000942a8
data8 0x3e51eeae0e442d6e, 0x3e4fb265376623f2
data8 0x3e57b5941782d830, 0x3e3a4b83f24ae52c
data8 0x3e5a5fb4f23978de, 0x3e51ed071563fb02
data8 0x3e49e2071f51a7a8, 0x3e5e43ae5b924234
data8 0x3dfa2be9aedf374a, 0x3e56dea3dbba67d5
data8 0x3e3375fe732b3c3e, 0x3e5a0c6f91f2e77e
data8 0x3e55e1bf1c969e41, 0x3e30a5a5166b8eee
data8 0x3e53e6e9a539d46c, 0x3e542981b3d7b0e6
data8 0x3e595fd8ff36ad64, 0x3e5edeb9e65cbbb4
data8 0x3e46aeab4d3434c1, 0x3e4ea3ff0564b010
data8 0x3e59b00be2e3c25a, 0x3e5b887cd7b0821f
data8 0x3e5f666668547b4d, 0x3e4d0733a805273f
data8 0x3e26a2ff21c4aec5, 0x3e4c336f7a3a78f3
data8 0x3e11ad12b628e2d0, 0x3e56d43ff3f0ea64
data8 0x3e238809433cccd2, 0x3e40d9734147d40f
data8 0x3e54245fe3e24e06, 0x3e251441fce4d48c
data8 0x3e517114efc5d1f9, 0x3e5e9a99154b0d82
data8 0x3e442a71337970f8, 0x3e420c7c69211fdf
data8 0x3e537e7d5d43c6a7, 0x3e4376c66ad9ad8b
data8 0x3e49054d678a4f1c, 0x3e5d23cb3bc19f18
data8 0x3e6ebcd449dcab2b, 0x3e67f5fc2849c88a
data8 0x3e63f388395d3e84, 0x3e65c1103b0ad7e9
data8 0x3e6d5d1dd031f353, 0x3e5a159dae75c4d0
data8 0x3e4d5e22aa75f71d, 0x3e5e379ee62e1e35
data8 0x3e4df082213cb2dc, 0x3e6bfa06c156f521
data8 0x3e66e2d3c19b517b, 0x3e426b7098590071
data8 0x3e541bd027e9854e, 0x3e5061dd924b0ac0
data8 0x3e6dae01df373a03, 0x3e3baec80b207b0b
data8 0x3e6b6a6fe06bebac, 0x3e61aebcfc3ab5d1
data8 0x3e584ee3e7c79d83, 0x3e6b3c1b2840cb40
data8 0x3e6c842085d6befd, 0x3e6ac04fd7b141e0
data8 0x3e6c48250474141d, 0x3e2d889b86125f69
data8 0x3e6e74740225dad0, 0x3e45940d31d50a7c
data8 0x3e695476a6c39ddc, 0x3e6d9a6d857a060a
data8 0x3e4a3e9bb4b69337, 0x3e484f3ce4707ed6
data8 0x3e39dd125d25fc27, 0x3e563fb400de8732
data8 0x3e5fdd6d0ee28b48, 0x3e669d15b869bb07
data8 0x3e40687cfad7964d, 0x3e69317990d43957
data8 0x3e633d57e24ae1bd, 0x3e618bf03710eabb
data8 0x3e4b4df6fccd1160, 0x3e3fb26ddaa1ec45
data8 0x3e3810a5e1817fd4, 0x3e6857373642fa5c
data8 0x3e673db6193add31, 0x3e63200c8acbc9c3
data8 0x3e3d2dee448ebb62, 0x3e6a19723a80db6a
data8 0x3e5e7cdab8fd3e6a, 0x3e671855cd660672
data8 0x3e473c3c78a85ecd, 0x3e5f5e23056a7cf2
data8 0x3e52538519527367, 0x3e4b573bcf2580e9
data8 0x3e6d6f856fe90c60, 0x3e2d932a8487642e
data8 0x3e5236fc78b6174c, 0x3e50cb91d406db50
data8 0x3e650e8bd562aa57, 0x3e424ee3d9a82f2e
data8 0x3e59363960e1e3d9, 0x3e379604c1150a3e
data8 0x3e6d914f6c2ac258, 0x3e62967a451a7b48
data8 0x3e684b5f01139cb2, 0x3e448bbfbf6d292c
data8 0x3e6227e7fb487e73, 0x3e6d39d50290f458
data8 0x3e58368342b4b668, 0x3e65dc0c25bd1763
data8 0x3e61b7dc362e22b5, 0x3e671691f094bb80
data8 0x3e5011642d5123f2, 0x3e4c4eb7f11e41be
data8 0x3e5dcee36ca242cf, 0x3e6791cefff688f1
data8 0x3e60e23c8dda4ecd, 0x3e48e6a22fe78cfe
data8 0x3e6d703f244adc86, 0x3e6a281a85a5049d
data8 0x3e570f20e6403d9e, 0x3e2211518a12956f
data8 0x3e6737d1e54d71df, 0x3e66b1881476f5e9
data8 0x3e6e1bbeef085376, 0x3e47cad4944a32be
data8 0x3e527f2c738e7ee9, 0x3e699883a4b9fb29
data8 0x3e5c17d1108740d9, 0x3e5d4a9c79a43389
data8 0x3e49fdc24462ba3b, 0x3e24dbb3a60cceb2
data8 0x3e5c5bf618780748, 0x3e5c38005b0c778c
data8 0x3e6be168dd6dd3fe, 0x3e633ab9370693b0
data8 0x3dd290556b0ae339, 0x3e607c317927096a
data8 0x3e59651353b3d90e, 0x3e4d8751e5e0ae0d
data8 0x3e46c81023272a85, 0x3e6b23c988f391b2
data8 0x3e608741d215209c, 0x3e60b8ba506d758f
data8 0x3e62ddbe74803297, 0x3e5dbb8b5087587d
data8 0x3e642aa529048131, 0x3e3dcbda6835dcf4
data8 0x3e6db503ce854d2a, 0x3e6dd00b49bc6849
data8 0x3e4db2f11243bc84, 0x3e3b9848efc2ea97
data8 0x3e58f18e17c82609, 0x3e6ed8645e16c312
data8 0x3e4065bdb60a5dd4, 0x3e490453c6e6c30a
data8 0x3e62373994aa31ba, 0x3e56305f0e6b2a95
data8 0x3e68c1601a6614ee, 0x3e614e204f19d93f
data8 0x3e6e5037ca773299, 0x3e693f98892561a6
data8 0x3e639de4f4bf700d, 0x3e416c071e93fd97
data8 0x3e65466991b415ef, 0x3e6896a324afac9d
data8 0x3e44f64802e2f11c, 0x3e64d7d747e2191a
data8 0x3e6174b7581de84c, 0x3e44c7b946e1d43c
data8 0x3e6a3bcbe30512ec, 0x3e5d3ed411c95ce4
data8 0x3e3e5b5735cfaf8e, 0x3e6e538ab34efb51
data8 0x3e514e204f19d93f, 0x3e5a88e6550c89a4
data8 0x3e66b97a5d9dfd8b, 0x3e5f46b1e14ebaf3
data8 0x3e357665f6893f5d, 0x3e6bbf633078d1d5
data8 0x3e5e7337a212c417, 0x3e3570fde15fc8cc
data8 0x3e21119402da92b4, 0x3e6566e830d1ff3b
data8 0x3e558883e480e220, 0x3e589ca3a68da411
data8 0x3e44eb66df73d648, 0x3e1a0a629b1b7e68
data8 0x3e54cc207b8c1116, 0x0000000000000000
LOCAL_OBJECT_END(T_low)
.section .text
GLOBAL_IEEE754_ENTRY(log2l)
{ .mfi
alloc r32=ar.pfs,1,4,4,0
// normalize x
// y=frcpa(x)
frcpa.s1 f41,p0=f1,f8
// r26=bias-1
mov r26=0xfffe
}
{.mfi
// r23=bias+16
mov r23=0xffff+16
fma.s1 f7=f8,f1,f0
// r2 = pointer to C_1...C_6 followed by T_table
addl r2 = @ltoff(poly_coeffs), gp;;
}
{.mfi
// get significand
getf.sig r25=f8
// f8 denormal ?
fclass.m p8,p10=f8,0x9
// r24=bias-8
mov r24=0xffff-8;;
}
{.mfi
setf.exp f36=r26
nop.f 0
// r27=bias
mov r27=0xffff;;
}
{.mmf
getf.exp r29=f8
// load start address for C_1...C_7 followed by T_table
ld8 r2=[r2]
// will continue only for positive normal/unnormal numbers
fclass.m.unc p0,p12 = f8, 0x19;;
}
.pred.rel "mutex",p8,p10
{.mfi
// denormal input, repeat get significand (after normalization)
(p8) getf.sig r25=f7
// x=1 ?
fcmp.eq.s0 p6,p0=f8,f1
// get T_index
(p10) shr.u r28=r25,63-8
}
{.mfi
// f32=2^16
setf.exp f32=r23
nop.f 0
mov r26=0x804;;
}
{.mfi
// denormal input, repeat get exponent (after normalization)
(p8) getf.exp r29=f7
// f33=0
mov f33=f0
// r26=0x80400...0 (threshold for using polynomial approximation)
shl r26=r26,64-12;;
}
{.mfb
add r3=16,r2
// r=x*y-1
fms.s1 f6=f41,f8,f1
(p12) br.cond.spnt SPECIAL_log2l
}
{.mfi
// load C_1
ldfe f14=[r2],48
// RN(x*y)
fma.s1 f43=f41,f8,f0
mov r23=0xff;;
}
{.mmi
// load C_7, C_8
ldfpd f10,f11=[r3],16
// load C_3,C_4
ldfpd f15,f42=[r2],16
(p8) shr.u r28=r25,63-8;;
}
{.mfi
// load C_5, C_6
ldfpd f12,f13=[r3]
// pseudo-zero ?
fcmp.eq.s0 p7,p0=f7,f0
// if first 9 bits after leading 1 are all zero, then p8=1
cmp.ltu p8,p12=r25,r26
}
{.mfi
// load C1l
ldfe f34=[r2],16
fmerge.se f7=f1,f7
// get T_index
and r28=r28,r23;;
}
{.mfi
// r29=exponent-bias
sub r29=r29,r27
// if first 8 bits after leading bit are 0, use polynomial approx. only
(p8) fms.s1 f6=f7,f1,f1
// start address of T_low
add r3=1024+16,r2
}
{.mfi
// load C_2
ldfe f35=[r2],16
// x=1, return 0
(p6) fma.s0 f8=f0,f0,f0
// first 8 bits after leading 1 are all ones ?
cmp.eq p10,p0=r23,r28;;
}
{.mfb
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
// add 1 to the exponent additive term, and estimate log2(1-r)
(p10) add r29=1,r29
nop.f 0
(p7) br.cond.spnt LOG2_PSEUDO_ZERO
}
{.mfi
// get T_low address
shladd r3=r28,3,r3
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
(p10) fms.s1 f6=f7,f36,f1
// p10 --> p8=1, p12=0
(p10) cmp.eq p8,p12=r0,r0;;
}
{.mfi
// get T_high address
shladd r2=r28,2,r2
// L(x*y)=x*y-RN(x*y)
fms.s1 f41=f41,f8,f43
nop.i 0
}
{.mfi
// p13=p12
(p12) cmp.eq.unc p13,p0=r0,r0
// RtH=RN(x*y)-1 (will eliminate rounding errors in r)
fms.s1 f43=f43,f1,f1
nop.i 0;;
}
.pred.rel "mutex",p8,p12
{.mfb
// load T_high (unless first 9 bits after leading 1 are 0)
(p12) ldfs f7=[r2]
// set T_high=0 (if first 9 bits after leading 1 are 0)
(p8) fma.s1 f7=f0,f0,f0
// x=1, return
(p6) br.ret.spnt b0
}
.pred.rel "mutex",p8,p12
{.mfi
// p12: load T_low
(p12) ldfd f36=[r3]
// p8: set T_low=0
(p8) fma.s1 f36=f0,f0,f0
(p8) cmp.eq p8,p12=r29,r0;; //nop.i 0;;
}
.pred.rel "mutex",p8,p12
{.mfi
// f8=expon - bias
setf.sig f8=r29
// general case: 2^{16}+C1*r
(p12) fma.s1 f33=f6,f14,f32
nop.i 0
}
{.mfi
// r26=1
mov r26=1
// p8 (mantissa is close to 1, or close to 2): 2^{-8}+C1*r
(p8) fma.s1 f32=f6,f14,f33
nop.i 0;;
}
{.mfi
nop.m 0
// P78=C_7+C_8*r
fma.s1 f10=f11,f6,f10
// r26=2^{63}
shl r26=r26,63
}
{.mfi
nop.m 0
// P34=C_3+r*C_4
fma.s1 f15=f42,f6,f15
nop.i 0;;
}
{.mfi
nop.m 0
// r2=r*r
fma.s1 f11=f6,f6,f0
nop.i 0
}
{.mfi
nop.m 0
// P56=C_5+C_6*r
fma.s1 f13=f13,f6,f12
nop.i 0;;
}
{.mfi
nop.m 0
// Rth-r
(p13) fms.s1 f43=f43,f1,f6
nop.i 0
}
{.mfi
// significand(x)=1 ?
cmp.eq p0,p6=r25,r26
// P12=C1l+C_2*r
fma.s1 f34=f35,f6,f34
nop.i 0;;
}
.pred.rel "mutex",p8,p12
{.mfi
nop.m 0
// p12: C1r=(2^{16}+C1*r)-2^{16}
(p12) fms.s1 f32=f33,f1,f32
nop.i 0
}
{.mfi
nop.m 0
// p8: C1r=C1*r (double extended)
(p8) fms.s1 f32=f32,f1,f33
nop.i 0;;
}
{.mfi
nop.m 0
// L(x*y)*C_1+T_low
(p13) fma.s1 f36=f41,f14,f36
nop.i 0
}
{.mfi
nop.m 0
// P58=P56+r2*P78
fma.s1 f13=f11,f10,f13
nop.i 0;;
}
{.mfi
nop.m 0
// P14=P12+r2*P34
fma.s1 f15=f15,f11,f34
nop.i 0
}
{.mfi
nop.m 0
// r4=r2*r2
fma.s1 f11=f11,f11,f0
nop.i 0;;
}
{.mfi
nop.m 0
// normalize additive term (l=exponent of x)
fcvt.xf f8=f8
nop.i 0;;
}
{.mfi
nop.m 0
// D=C1*r-C1r
(p6) fms.s1 f12=f14,f6,f32
nop.i 0;;
}
{.mfi
nop.m 0
// T_low'=(Rth-r)*C1+(L(x*y)*C1+T_low)
(p13) fma.s1 f36=f43,f14,f36
nop.i 0;;
}
{.mfi
nop.m 0
// P18=P14+r4*P58
(p6) fma.s1 f13=f11,f13,f15
nop.i 0;;
}
{.mfi
nop.m 0
// add T_high+l
(p6) fma.s1 f8=f8,f1,f7
nop.i 0;;
}
{.mfi
nop.m 0
// D+T_low
(p6) fma.s1 f12=f12,f1,f36
nop.i 0;;
}
{.mfi
nop.m 0
// (T_high+l)+C1r
(p6) fma.s1 f8=f8,f1,f32
nop.i 0
}
{.mfi
nop.m 0
// (D+T_low)+r*P18
(p6) fma.s1 f13=f13,f6,f12
nop.i 0;;
}
//{.mfb
//nop.m 0
//mov f8=f36
//fma.s0 f8=f13,f6,f0
//br.ret.sptk b0;;
//}
{.mfb
nop.m 0
// result=((T_high+l)+C1r)+((D+T_low)+r*P18)
(p6) fma.s0 f8=f13,f1,f8
// return
br.ret.sptk b0;;
}
// SPECIAL_log2l: special-operand path of log2l.
//
// In:   f8 = x (the argument; also the return-value register).
//       NOTE(review): the branch that reaches this label is outside this
//       view; the classifications below cover +Inf, +/-0, negative
//       operands and NaNs.
// Out:  f8 = result for the cases that return directly (+Inf, NaN).
//       For +/-0 and x<0 control transfers to __libm_error_region with
//       FR_X = original x, f8 = default result and GR_Parameter_TAG set
//       (168 for the zero case, 169 for the negative case).
// Uses: p6, p7, p8, FR_X, GR_Parameter_TAG.
//
// Every classification must look at f8: FR_X is copied from f8 and the
// p8 path builds -0 from f8's exponent/significand, which is only valid
// if f8 itself was the value found to be zero.
SPECIAL_log2l:
{.mfi
nop.m 0
// keep the original argument for the error handler
mov FR_X=f8
nop.i 0
}
{.mfi
nop.m 0
// x=+Infinity ?
fclass.m p7,p0=f8,0x21
nop.i 0;;
}
{.mfi
nop.m 0
// x=+/-Zero ?  (classify x in f8, like the other tests in this path)
fclass.m p8,p0=f8,0x7
nop.i 0;;
}
{.mfi
nop.m 0
// x=-Infinity, -normal, -denormal ?
fclass.m p6,p0=f8,0x3a
nop.i 0;;
}
{.mfb
nop.m 0
// log2l(+Infinity)=+Infinity: f8 already holds the result
nop.f 0
(p7) br.ret.spnt b0;;
}
{.mfi
// error tag for the zero-argument case
(p8) mov GR_Parameter_TAG = 168
// log2l(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
(p8) fmerge.ns f8=f0,f8
nop.i 0;;
}
{.mfb
nop.m 0
// 1/(-0) under s0: produces the -infinity result and the Divide by Zero flag
(p8) frcpa.s0 f8,p0=f1,f8
(p8) br.cond.sptk __libm_error_region;;
}
{.mfb
// error tag for the negative-argument case
(p6) mov GR_Parameter_TAG = 169
// x<0: return NaN, raise Invalid
(p6) frcpa.s0 f8,p0=f0,f0
(p6) br.cond.sptk __libm_error_region;;
}
{.mfb
nop.m 0
// Remaining cases: NaNs
// result = x*1+0, computed under status field s0
fma.s0 f8=f8,f1,f0
br.ret.sptk b0;;
}
// LOG2_PSEUDO_ZERO: zero-like operand path of log2l, reached from the main
// path by a p7-predicated branch.
// NOTE(review): the code that sets p7 is outside this view; going by the
// label name, f8 presumably holds a pseudo-zero encoding here - confirm.
//
// In:   f8 = x.
// Out:  FR_X = original x, GR_Parameter_TAG = 168,
//       f8 = result of frcpa.s0 on 1/(-0); control then transfers to
//       __libm_error_region (this path never returns directly).
LOG2_PSEUDO_ZERO:
{.mfi
nop.m 0
// keep the original argument for the error handler
mov FR_X=f8
nop.i 0
}
{.mfi
// same error tag as the +/-0 case in SPECIAL_log2l
mov GR_Parameter_TAG = 168
// log2l(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
fmerge.ns f8=f0,f8
nop.i 0;;
}
{.mfb
nop.m 0
// 1/(-0) under status field s0, then hand over to the shared error stub
frcpa.s0 f8,p0=f1,f8
br.cond.sptk __libm_error_region;;
}
GLOBAL_IEEE754_END(log2l)
libm_alias_ldouble_other (__log2, log2)
// __libm_error_region: shared error stub reached by branches from the
// special-case paths above. It spills the operands and the default result
// to a fresh 64-byte stack frame, calls __libm_error_support, and returns
// whatever value that routine leaves in the result slot.
//
// In:   FR_X, FR_Y   = operand values to report
//       FR_RESULT    = default result
//       GR_Parameter_TAG is set by the branching code and is not touched
//       here. NOTE(review): presumably it is passed on to
//       __libm_error_support as an argument register - confirm against the
//       register definitions at the top of the file.
// Out:  f8 = value reloaded from the result slot after the call.
//
// Frame layout once "add sp=-64,sp" has executed (offsets from the new sp):
//   sp+16 : FR_X       (parameter 1)
//   sp+32 : FR_Y       (parameter 2)
//   sp+48 : FR_RESULT  (parameter 3, reloaded into f8 after the call)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
// computed from the caller's sp (the sp update is in the next bundle of
// the same instruction group), i.e. new sp + 32
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
// post-increment leaves GR_Parameter_Y at new sp + 48
stfe [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfe [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
// copies the current GR_Parameter_Y (new sp + 48) as the result address
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
// step GR_Parameter_Y back to the parameter 2 slot (new sp + 32)
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
nop.m 0
nop.m 0
// the result address is recomputed from sp after the call
// NOTE(review): presumably because GR_Parameter_RESULT is not preserved
// across the call - confirm
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
LOCAL_LIBM_END(__libm_error_region)
// the error handler is an external, global function symbol
.type __libm_error_support#,@function
.global __libm_error_support#

View File

@ -1 +0,0 @@
/* Not needed. */

Some files were not shown because too many files have changed in this diff Show More