mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 12:30:06 +00:00
Remove ia64-linux-gnu
Linux 6.7 removed ia64 from the official tree [1], so the port is removed from glibc as well, following the general principle that a glibc port needs upstream support for the architecture in all the components it depends on (binutils, GCC, and the Linux kernel). Apart from the removal of sysdeps/ia64 and sysdeps/unix/sysv/linux/ia64, there are updates to various comments referencing ia64 for which removal of those references seemed appropriate. The configuration is removed from README and build-many-glibcs.py. The CONTRIBUTED-BY, elf/elf.h, manual/contrib.texi (the porting mention), *.po files, config.guess, and longlong.h are not changed. On Linux, this also allows cleaning up some clone2 support in multiple files. The following bugs can be closed as WONTFIX: BZ 22634 [2], BZ 14250 [3], BZ 21634 [4], BZ 10163 [5], BZ 16401 [6], and BZ 11585 [7]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=43ff221426d33db909f7159fdf620c3b052e2d1c [2] https://sourceware.org/bugzilla/show_bug.cgi?id=22634 [3] https://sourceware.org/bugzilla/show_bug.cgi?id=14250 [4] https://sourceware.org/bugzilla/show_bug.cgi?id=21634 [5] https://sourceware.org/bugzilla/show_bug.cgi?id=10163 [6] https://sourceware.org/bugzilla/show_bug.cgi?id=16401 [7] https://sourceware.org/bugzilla/show_bug.cgi?id=11585 Reviewed-by: Carlos O'Donell <carlos@redhat.com>
This commit is contained in:
parent
e171ad7d59
commit
460860f457
4
INSTALL
4
INSTALL
@ -609,9 +609,7 @@ Specific advice for GNU/Linux systems
|
||||
|
||||
If you are installing the GNU C Library on GNU/Linux systems, you need
|
||||
to have the header files from a 3.2 or newer kernel around for
|
||||
reference. (For the ia64 architecture, you need version 3.2.18 or newer
|
||||
because this is the first version with support for the ‘accept4’ system
|
||||
call.) These headers must be installed using ‘make headers_install’;
|
||||
reference. These headers must be installed using ‘make headers_install’;
|
||||
the headers present in the kernel source directory are not suitable for
|
||||
direct use by the GNU C Library. You do not need to use that kernel,
|
||||
just have its headers installed where the GNU C Library can access them,
|
||||
|
2
NEWS
2
NEWS
@ -80,6 +80,8 @@ Deprecated and removed features, and other changes affecting compatibility:
|
||||
of GNU libc are advised to check whether their build processes can be
|
||||
simplified.
|
||||
|
||||
* The ia64*-*-linux-gnu configurations are no longer supported.
|
||||
|
||||
Changes to build and runtime requirements:
|
||||
|
||||
* Building on LoongArch requires at a minimum binutils 2.41 for vector
|
||||
|
1
README
1
README
@ -30,7 +30,6 @@ The GNU C Library supports these configurations for using Linux kernels:
|
||||
hppa-*-linux-gnu
|
||||
i[4567]86-*-linux-gnu
|
||||
x86_64-*-linux-gnu Can build either x86_64 or x32
|
||||
ia64-*-linux-gnu
|
||||
loongarch64-*-linux-gnu Hardware floating point, LE only.
|
||||
m68k-*-linux-gnu
|
||||
microblaze*-*-linux-gnu
|
||||
|
@ -24,8 +24,7 @@ type mcontext_t
|
||||
|
||||
type ucontext_t
|
||||
element ucontext_t {ucontext_t*} uc_link
|
||||
// Bug 21634: uc_sigmask has wrong type.
|
||||
xfail[ia64-linux]-element ucontext_t sigset_t uc_sigmask
|
||||
element ucontext_t sigset_t uc_sigmask
|
||||
element ucontext_t stack_t uc_stack
|
||||
// Bug 21635: uc_mcontext has wrong type.
|
||||
xfail[powerpc32-linux]-element ucontext_t mcontext_t uc_mcontext
|
||||
@ -138,8 +137,7 @@ constant SIGSTKSZ
|
||||
type ucontext_t
|
||||
|
||||
element ucontext_t {ucontext_t*} uc_link
|
||||
// Bug 21634: uc_sigmask has wrong type.
|
||||
xfail[ia64-linux]-element ucontext_t sigset_t uc_sigmask
|
||||
element ucontext_t sigset_t uc_sigmask
|
||||
element ucontext_t stack_t uc_stack
|
||||
// Bug 21635: uc_mcontext has wrong type.
|
||||
xfail[powerpc32-linux]-element ucontext_t mcontext_t uc_mcontext
|
||||
|
@ -4,8 +4,7 @@ type mcontext_t
|
||||
type ucontext_t
|
||||
|
||||
element ucontext_t {ucontext_t*} uc_link
|
||||
// Bug 21634: uc_sigmask has wrong type.
|
||||
xfail[ia64-linux]-element ucontext_t sigset_t uc_sigmask
|
||||
element ucontext_t sigset_t uc_sigmask
|
||||
element ucontext_t stack_t uc_stack
|
||||
// Bug 21635: uc_mcontext has wrong type.
|
||||
xfail[powerpc32-linux]-element ucontext_t mcontext_t uc_mcontext
|
||||
|
@ -179,9 +179,6 @@ print_entry (const char *lib, int flag, uint64_t hwcap,
|
||||
case FLAG_SPARC_LIB64:
|
||||
fputs (",64bit", stdout);
|
||||
break;
|
||||
case FLAG_IA64_LIB64:
|
||||
fputs (",IA-64", stdout);
|
||||
break;
|
||||
case FLAG_X8664_LIB64:
|
||||
fputs (",x86-64", stdout);
|
||||
break;
|
||||
|
@ -34,7 +34,6 @@ size_t taddr[] =
|
||||
0x00010000 /* Linux elf32/sparc */
|
||||
#if __WORDSIZE > 32
|
||||
,
|
||||
0x4000000000000000, /* Linux elf64/ia64 */
|
||||
0x0000000120000000, /* Linux elf64/alpha */
|
||||
0x4000000000001000, /* elf64/hppa */
|
||||
0x0000000100000000 /* Linux elf64/sparc */
|
||||
|
@ -669,8 +669,6 @@ patches, although we try to avoid this.
|
||||
|
||||
If you are installing @theglibc{} on @gnulinuxsystems{}, you need to have
|
||||
the header files from a 3.2 or newer kernel around for reference.
|
||||
(For the ia64 architecture, you need version 3.2.18 or newer because this
|
||||
is the first version with support for the @code{accept4} system call.)
|
||||
These headers must be installed using @samp{make headers_install}; the
|
||||
headers present in the kernel source directory are not suitable for
|
||||
direct use by @theglibc{}. You do not need to use that kernel, just have
|
||||
|
@ -69,7 +69,7 @@ Support for @code{_Float@var{N}} or @code{_Float@var{N}x} types is
|
||||
provided for @code{_Float32}, @code{_Float64} and @code{_Float32x} on
|
||||
all platforms.
|
||||
It is also provided for @code{_Float128} and @code{_Float64x} on
|
||||
powerpc64le (PowerPC 64-bits little-endian), x86_64, x86, ia64,
|
||||
powerpc64le (PowerPC 64-bit little-endian), x86_64, x86,
|
||||
aarch64, alpha, loongarch, mips64, riscv, s390 and sparc.
|
||||
|
||||
@menu
|
||||
|
@ -128,7 +128,7 @@ extern const char doc[];
|
||||
/* On some architectures, glibc can be built with compilers that do
|
||||
not have suitable built-in functions for setting the payload of a
|
||||
_Float128 NaN. */
|
||||
#if ((defined __x86_64__ || defined __i386__ || defined __ia64__) \
|
||||
#if ((defined __x86_64__ || defined __i386__) \
|
||||
&& !__GNUC_PREREQ (7, 0))
|
||||
# define XFAIL_FLOAT128_PAYLOAD (TEST_COND_binary128 ? XFAIL_TEST : 0)
|
||||
#else
|
||||
|
@ -150,9 +150,7 @@ __attribute ((always_inline))
|
||||
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
|
||||
size_t pagesize_m1)
|
||||
{
|
||||
#ifdef NEED_SEPARATE_REGISTER_STACK
|
||||
return mem + (((size - guardsize) / 2) & ~pagesize_m1);
|
||||
#elif _STACK_GROWS_DOWN
|
||||
#if _STACK_GROWS_DOWN
|
||||
return mem;
|
||||
#elif _STACK_GROWS_UP
|
||||
return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
|
||||
@ -166,7 +164,7 @@ setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
|
||||
const int prot)
|
||||
{
|
||||
char *guardend = guard + guardsize;
|
||||
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
|
||||
#if _STACK_GROWS_DOWN
|
||||
/* As defined at guard_position, for architectures with downward stack
|
||||
the guard page is always at start of the allocated area. */
|
||||
if (__mprotect (guardend, size - guardsize, prot) != 0)
|
||||
@ -189,7 +187,7 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
|
||||
{
|
||||
uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
|
||||
size_t pagesize_m1 = __getpagesize () - 1;
|
||||
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
|
||||
#if _STACK_GROWS_DOWN
|
||||
size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
|
||||
assert (freesize < size);
|
||||
if (freesize > PTHREAD_STACK_MIN)
|
||||
@ -510,19 +508,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
|
||||
{
|
||||
/* The old guard area is too large. */
|
||||
|
||||
#ifdef NEED_SEPARATE_REGISTER_STACK
|
||||
char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
|
||||
char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
|
||||
|
||||
if (oldguard < guard
|
||||
&& __mprotect (oldguard, guard - oldguard, prot) != 0)
|
||||
goto mprot_error;
|
||||
|
||||
if (__mprotect (guard + guardsize,
|
||||
oldguard + pd->guardsize - guard - guardsize,
|
||||
prot) != 0)
|
||||
goto mprot_error;
|
||||
#elif _STACK_GROWS_DOWN
|
||||
#if _STACK_GROWS_DOWN
|
||||
if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
|
||||
prot) != 0)
|
||||
goto mprot_error;
|
||||
@ -599,7 +585,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
|
||||
static void
|
||||
name_stack_maps (struct pthread *pd, bool set)
|
||||
{
|
||||
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
|
||||
#if _STACK_GROWS_DOWN
|
||||
void *stack = pd->stackblock + pd->guardsize;
|
||||
#else
|
||||
void *stack = pd->stackblock;
|
||||
|
@ -708,8 +708,6 @@ clock_getcpuclockid (pid_t pid, clockid_t *clock_id)
|
||||
({ unsigned int _hi, _lo; \
|
||||
asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
|
||||
(Var) = ((unsigned long long int) _hi << 32) | _lo; })
|
||||
#elif defined __ia64__
|
||||
#define HP_TIMING_NOW(Var) __asm__ __volatile__ ("mov %0=ar.itc" : "=r" (Var) : : "memory")
|
||||
#else
|
||||
#error "HP_TIMING_NOW missing"
|
||||
#endif
|
||||
|
@ -234,10 +234,6 @@ class Context(object):
|
||||
os_name='linux-gnu')
|
||||
self.add_config(arch='i686',
|
||||
os_name='gnu')
|
||||
self.add_config(arch='ia64',
|
||||
os_name='linux-gnu',
|
||||
first_gcc_cfg=['--with-system-libunwind'],
|
||||
binutils_cfg=['--enable-obsolete'])
|
||||
self.add_config(arch='loongarch64',
|
||||
os_name='linux-gnu',
|
||||
variant='lp64d',
|
||||
@ -1300,7 +1296,6 @@ def install_linux_headers(policy, cmdlist):
|
||||
'i586': 'x86',
|
||||
'i686': 'x86',
|
||||
'i786': 'x86',
|
||||
'ia64': 'ia64',
|
||||
'loongarch64': 'loongarch',
|
||||
'm68k': 'm68k',
|
||||
'microblaze': 'microblaze',
|
||||
|
@ -27,18 +27,11 @@ xclone (int (*fn) (void *arg), void *arg, void *stack, size_t stack_size,
|
||||
{
|
||||
pid_t r = -1;
|
||||
|
||||
# ifdef __ia64__
|
||||
extern int __clone2 (int (*fn) (void *arg), void *stack, size_t stack_size,
|
||||
int flags, void *arg, ...);
|
||||
r = __clone2 (fn, stack, stack_size, flags, arg, /* ptid */ NULL,
|
||||
/* tls */ NULL, /* ctid */ NULL);
|
||||
# else
|
||||
# if _STACK_GROWS_DOWN
|
||||
r = clone (fn, stack + stack_size, flags, arg, /* ptid */ NULL,
|
||||
/* tls */ NULL, /* ctid */ NULL);
|
||||
# elif _STACK_GROWS_UP
|
||||
r = clone (fn, stack, flags, arg, /* ptid */ NULL, /* tls */ NULL, NULL);
|
||||
# endif
|
||||
# endif
|
||||
|
||||
if (r < 0)
|
||||
|
@ -30,7 +30,6 @@
|
||||
#define FLAG_ELF_LIBC6 0x0003
|
||||
#define FLAG_REQUIRED_MASK 0xff00
|
||||
#define FLAG_SPARC_LIB64 0x0100
|
||||
#define FLAG_IA64_LIB64 0x0200
|
||||
#define FLAG_X8664_LIB64 0x0300
|
||||
#define FLAG_S390_LIB64 0x0400
|
||||
#define FLAG_POWERPC_LIB64 0x0500
|
||||
|
@ -224,7 +224,6 @@ _Unwind_FindEnclosingFunction (void *pc)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifndef __ia64__
|
||||
_Unwind_Ptr
|
||||
_Unwind_GetDataRelBase (struct _Unwind_Context *context)
|
||||
{
|
||||
@ -236,7 +235,6 @@ _Unwind_GetTextRelBase (struct _Unwind_Context *context)
|
||||
{
|
||||
return (_Unwind_Ptr) context->bases.tbase;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Extract any interesting information from the CIE for the translation
|
||||
unit F belongs to. Return a pointer to the byte after the augmentation,
|
||||
|
@ -33,11 +33,7 @@ extern "C" {
|
||||
inefficient for 32-bit and smaller machines. */
|
||||
typedef unsigned _Unwind_Word __attribute__((__mode__(__unwind_word__)));
|
||||
typedef signed _Unwind_Sword __attribute__((__mode__(__unwind_word__)));
|
||||
#if defined(__ia64__) && defined(__hpux__)
|
||||
typedef unsigned _Unwind_Ptr __attribute__((__mode__(__word__)));
|
||||
#else
|
||||
typedef unsigned _Unwind_Ptr __attribute__((__mode__(__pointer__)));
|
||||
#endif
|
||||
typedef unsigned _Unwind_Internal_Ptr __attribute__((__mode__(__pointer__)));
|
||||
|
||||
/* @@@ The IA-64 ABI uses a 64-bit word to identify the producer and
|
||||
@ -190,29 +186,8 @@ extern void _Unwind_SjLj_Resume (struct _Unwind_Exception *);
|
||||
and data-relative addressing in the LDSA. In order to stay link
|
||||
compatible with the standard ABI for IA-64, we inline these. */
|
||||
|
||||
#ifdef __ia64__
|
||||
#include <stdlib.h>
|
||||
|
||||
static inline _Unwind_Ptr
|
||||
_Unwind_GetDataRelBase (struct _Unwind_Context *_C)
|
||||
{
|
||||
/* The GP is stored in R1. */
|
||||
return _Unwind_GetGR (_C, 1);
|
||||
}
|
||||
|
||||
static inline _Unwind_Ptr
|
||||
_Unwind_GetTextRelBase (struct _Unwind_Context *_C)
|
||||
{
|
||||
abort ();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* @@@ Retrieve the Backing Store Pointer of the given context. */
|
||||
extern _Unwind_Word _Unwind_GetBSP (struct _Unwind_Context *);
|
||||
#else
|
||||
extern _Unwind_Ptr _Unwind_GetDataRelBase (struct _Unwind_Context *);
|
||||
extern _Unwind_Ptr _Unwind_GetTextRelBase (struct _Unwind_Context *);
|
||||
#endif
|
||||
|
||||
/* @@@ Given an address, return the entry point of the function that
|
||||
contains it. */
|
||||
|
@ -1,6 +0,0 @@
|
||||
wordsize-64
|
||||
# ia64 uses IEEE 754 floating point.
|
||||
ieee754/float128
|
||||
ieee754/ldbl-96
|
||||
ieee754/dbl-64
|
||||
ieee754/flt-32
|
@ -1,4 +0,0 @@
|
||||
# ia64 does not provide crtbeginT.o, so use crtbegin.o.
|
||||
+prectorT = $(+prector)
|
||||
|
||||
float64x-alias-fcts = yes
|
@ -1,25 +0,0 @@
|
||||
# The ia64 `long double' is a distinct type we support.
|
||||
long-double-fcts = yes
|
||||
|
||||
ifeq ($(subdir),math)
|
||||
# sqrtf128 requires soft-fp.
|
||||
CPPFLAGS += -I../soft-fp
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),gmon)
|
||||
sysdep_routines += _mcount
|
||||
endif
|
||||
|
||||
ifeq ($(subdir), csu)
|
||||
CPPFLAGS-start.S = -D__ASSEMBLY__
|
||||
|
||||
ifeq (yes,$(build-shared))
|
||||
# Compatibility
|
||||
sysdep_routines += ia64libgcc
|
||||
shared-only-routines += ia64libgcc
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),elf)
|
||||
sysdep-dl-routines += dl-symaddr dl-fptr
|
||||
endif
|
@ -1,21 +0,0 @@
|
||||
ld {
|
||||
GLIBC_PRIVATE {
|
||||
# ia64 specific functions in the dynamic linker, but used by libc.so.
|
||||
_dl_symbol_address; _dl_lookup_address;
|
||||
_dl_function_address;
|
||||
}
|
||||
}
|
||||
libc {
|
||||
GLIBC_2.2 {
|
||||
# Functions from libgcc.
|
||||
__divtf3; __divdf3; __divsf3; __divdi3; __moddi3; __udivdi3; __umoddi3;
|
||||
__multi3;
|
||||
}
|
||||
}
|
||||
libm {
|
||||
GLIBC_2.1 {
|
||||
# A generic bug got this omitted from other configurations' version
|
||||
# sets, but we always had it.
|
||||
exp2l;
|
||||
}
|
||||
}
|
@ -1,90 +0,0 @@
|
||||
/* Machine-specific calling sequence for `mcount' profiling function. ia64
|
||||
Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Assembly stub to invoke _mcount(). Compiler generated code calls
|
||||
this stub before executing a function's prologue and without saving
|
||||
any registers. It is therefore necessary to preserve the input
|
||||
registers as they may contain function arguments. To work
|
||||
correctly with frame-less functions, it is also necessary to
|
||||
preserve the return pointer (b0 aka rp).
|
||||
|
||||
State upon entering _mcount:
|
||||
|
||||
r8 address of return value structure (used only when called
|
||||
function returns a large structure)
|
||||
r15 static link (used only for nested functions)
|
||||
in0 ar.pfs to restore before returning to the function that
|
||||
called _mcount
|
||||
in1 gp value to restore before returning to the function that
|
||||
called _mcount
|
||||
in2 return address in the function that invoked the caller
|
||||
of _mcount (frompc)
|
||||
in3 address of the global-offset table entry that holds the
|
||||
profile count dword allocated by the compiler; to get
|
||||
the address of this dword, use "ld8 in2=[in2]"; this
|
||||
dword can be used in any way by _mcount (including
|
||||
not at all, as is the case with the current implementation)
|
||||
b0 address to return to after _mcount is done
|
||||
*/
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
#undef ret
|
||||
|
||||
LEAF(_mcount)
|
||||
.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
|
||||
alloc loc1 = ar.pfs, 4, 4, 3, 0
|
||||
mov loc0 = rp
|
||||
.body
|
||||
mov loc2 = r8 // gcc uses r8 to pass pointer to return structure
|
||||
;;
|
||||
mov loc3 = r15 // gcc uses r15 to pass the static link to nested functions
|
||||
mov out0 = in2
|
||||
mov out1 = rp
|
||||
br.call.sptk.few rp = __mcount
|
||||
;;
|
||||
.here:
|
||||
{
|
||||
.mii
|
||||
mov gp = in1
|
||||
mov r2 = ip
|
||||
mov ar.pfs = loc1
|
||||
}
|
||||
;;
|
||||
adds r2 = _mcount_ret_helper - .here, r2
|
||||
mov b7 = loc0
|
||||
mov rp = in2
|
||||
;;
|
||||
mov r3 = in0
|
||||
mov r8 = loc2
|
||||
mov r15 = loc3
|
||||
mov b6 = r2
|
||||
br.ret.sptk.few b6
|
||||
END(_mcount)
|
||||
|
||||
LOCAL_LEAF(_mcount_ret_helper)
|
||||
.prologue
|
||||
.altrp b7
|
||||
.save ar.pfs, r3
|
||||
.body
|
||||
alloc r2 = ar.pfs, 0, 0, 8, 0
|
||||
mov ar.pfs = r3
|
||||
br b7
|
||||
END(_mcount_ret_helper)
|
||||
|
||||
weak_alias (_mcount, mcount)
|
@ -1,3 +0,0 @@
|
||||
/* An instruction which should crash any program is `break 0' which triggers
|
||||
SIGILL. */
|
||||
#define ABORT_INSTRUCTION asm ("break 0")
|
@ -1,96 +0,0 @@
|
||||
/* Copyright (C) 2003-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <ia64intrin.h>
|
||||
|
||||
#define __HAVE_64B_ATOMICS 1
|
||||
#define USE_ATOMIC_COMPILER_BUILTINS 0
|
||||
|
||||
/* XXX Is this actually correct? */
|
||||
#define ATOMIC_EXCHANGE_USES_CAS 0
|
||||
|
||||
|
||||
#define __arch_compare_and_exchange_bool_8_acq(mem, newval, oldval) \
|
||||
(abort (), 0)
|
||||
|
||||
#define __arch_compare_and_exchange_bool_16_acq(mem, newval, oldval) \
|
||||
(abort (), 0)
|
||||
|
||||
#define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \
|
||||
(!__sync_bool_compare_and_swap ((mem), (int) (long) (oldval), \
|
||||
(int) (long) (newval)))
|
||||
|
||||
#define __arch_compare_and_exchange_bool_64_acq(mem, newval, oldval) \
|
||||
(!__sync_bool_compare_and_swap ((mem), (long) (oldval), \
|
||||
(long) (newval)))
|
||||
|
||||
#define __arch_compare_and_exchange_val_8_acq(mem, newval, oldval) \
|
||||
(abort (), (__typeof (*mem)) 0)
|
||||
|
||||
#define __arch_compare_and_exchange_val_16_acq(mem, newval, oldval) \
|
||||
(abort (), (__typeof (*mem)) 0)
|
||||
|
||||
#define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
|
||||
__sync_val_compare_and_swap ((mem), (int) (long) (oldval), \
|
||||
(int) (long) (newval))
|
||||
|
||||
#define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
|
||||
__sync_val_compare_and_swap ((mem), (long) (oldval), (long) (newval))
|
||||
|
||||
/* Atomically store newval and return the old value. */
|
||||
#define atomic_exchange_acq(mem, value) \
|
||||
__sync_lock_test_and_set (mem, value)
|
||||
|
||||
#define atomic_exchange_rel(mem, value) \
|
||||
(__sync_synchronize (), __sync_lock_test_and_set (mem, value))
|
||||
|
||||
#define atomic_exchange_and_add(mem, value) \
|
||||
__sync_fetch_and_add ((mem), (value))
|
||||
|
||||
#define atomic_decrement_if_positive(mem) \
|
||||
({ __typeof (*mem) __oldval, __val; \
|
||||
__typeof (mem) __memp = (mem); \
|
||||
\
|
||||
__val = (*__memp); \
|
||||
do \
|
||||
{ \
|
||||
__oldval = __val; \
|
||||
if (__builtin_expect (__val <= 0, 0)) \
|
||||
break; \
|
||||
__val = atomic_compare_and_exchange_val_acq (__memp, __oldval - 1, \
|
||||
__oldval); \
|
||||
} \
|
||||
while (__builtin_expect (__val != __oldval, 0)); \
|
||||
__oldval; })
|
||||
|
||||
#define atomic_bit_test_set(mem, bit) \
|
||||
({ __typeof (*mem) __oldval, __val; \
|
||||
__typeof (mem) __memp = (mem); \
|
||||
__typeof (*mem) __mask = ((__typeof (*mem)) 1 << (bit)); \
|
||||
\
|
||||
__val = (*__memp); \
|
||||
do \
|
||||
{ \
|
||||
__oldval = __val; \
|
||||
__val = atomic_compare_and_exchange_val_acq (__memp, \
|
||||
__oldval | __mask, \
|
||||
__oldval); \
|
||||
} \
|
||||
while (__builtin_expect (__val != __oldval, 0)); \
|
||||
__oldval & __mask; })
|
||||
|
||||
#define atomic_full_barrier() __sync_synchronize ()
|
@ -1,11 +0,0 @@
|
||||
#ifndef _BITS_ENDIANNESS_H
|
||||
#define _BITS_ENDIANNESS_H 1
|
||||
|
||||
#ifndef _BITS_ENDIAN_H
|
||||
# error "Never use <bits/endianness.h> directly; include <endian.h> instead."
|
||||
#endif
|
||||
|
||||
/* IA64 is little-endian. */
|
||||
#define __BYTE_ORDER __LITTLE_ENDIAN
|
||||
|
||||
#endif /* bits/endianness.h */
|
@ -1,104 +0,0 @@
|
||||
/* Copyright (C) 1999-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _FENV_H
|
||||
# error "Never use <bits/fenv.h> directly; include <fenv.h> instead."
|
||||
#endif
|
||||
|
||||
|
||||
/* Define bits representing the exception. We use the bit positions of
|
||||
the appropriate bits in the FPSR... (Tahoe EAS 2.4 5-4) */
|
||||
|
||||
enum
|
||||
{
|
||||
FE_INEXACT =
|
||||
#define FE_INEXACT (1 << 5)
|
||||
FE_INEXACT,
|
||||
|
||||
FE_UNDERFLOW =
|
||||
#define FE_UNDERFLOW (1 << 4)
|
||||
FE_UNDERFLOW,
|
||||
|
||||
FE_OVERFLOW =
|
||||
#define FE_OVERFLOW (1 << 3)
|
||||
FE_OVERFLOW,
|
||||
|
||||
FE_DIVBYZERO =
|
||||
#define FE_DIVBYZERO (1 << 2)
|
||||
FE_DIVBYZERO,
|
||||
|
||||
FE_UNNORMAL =
|
||||
#define FE_UNNORMAL (1 << 1)
|
||||
FE_UNNORMAL,
|
||||
|
||||
FE_INVALID =
|
||||
#define FE_INVALID (1 << 0)
|
||||
FE_INVALID,
|
||||
|
||||
FE_ALL_EXCEPT =
|
||||
#define FE_ALL_EXCEPT (FE_INEXACT | FE_UNDERFLOW | FE_OVERFLOW | FE_DIVBYZERO | FE_UNNORMAL | FE_INVALID)
|
||||
FE_ALL_EXCEPT
|
||||
};
|
||||
|
||||
|
||||
enum
|
||||
{
|
||||
FE_TOWARDZERO =
|
||||
#define FE_TOWARDZERO 3
|
||||
FE_TOWARDZERO,
|
||||
|
||||
FE_UPWARD =
|
||||
#define FE_UPWARD 2
|
||||
FE_UPWARD,
|
||||
|
||||
FE_DOWNWARD =
|
||||
#define FE_DOWNWARD 1
|
||||
FE_DOWNWARD,
|
||||
|
||||
FE_TONEAREST =
|
||||
#define FE_TONEAREST 0
|
||||
FE_TONEAREST,
|
||||
};
|
||||
|
||||
|
||||
/* Type representing exception flags. */
|
||||
typedef unsigned long int fexcept_t;
|
||||
|
||||
/* Type representing floating-point environment. */
|
||||
typedef unsigned long int fenv_t;
|
||||
|
||||
/* If the default argument is used we use this value. */
|
||||
#define FE_DFL_ENV ((const fenv_t *) 0xc009804c0270033fUL)
|
||||
|
||||
#ifdef __USE_GNU
|
||||
/* Floating-point environment where only FE_UNNORMAL is masked since this
|
||||
exception is not generally supported by glibc. */
|
||||
# define FE_NOMASK_ENV ((const fenv_t *) 0xc009804c02700302UL)
|
||||
|
||||
/* Floating-point environment with (processor-dependent) non-IEEE
|
||||
floating point. In this case, turning on flush-to-zero mode for
|
||||
s0, s2, and s3. */
|
||||
# define FE_NONIEEE_ENV ((const fenv_t *) 0xc009a04d0270037fUL)
|
||||
#endif
|
||||
|
||||
#if __GLIBC_USE (IEC_60559_BFP_EXT_C2X)
|
||||
/* Type representing floating-point control modes. */
|
||||
typedef unsigned long int femode_t;
|
||||
|
||||
/* Default floating-point control modes. */
|
||||
# define FE_DFL_MODE ((const femode_t *) 0xc009804c0270033fUL)
|
||||
#endif
|
@ -1,119 +0,0 @@
|
||||
/* Macros to control TS 18661-3 glibc features on ia64.
|
||||
Copyright (C) 2017-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _BITS_FLOATN_H
|
||||
#define _BITS_FLOATN_H
|
||||
|
||||
#include <features.h>
|
||||
|
||||
/* Defined to 1 if the current compiler invocation provides a
|
||||
floating-point type with the IEEE 754 binary128 format, and this
|
||||
glibc includes corresponding *f128 interfaces for it. The required
|
||||
libgcc support was added some time after the basic compiler
|
||||
support. */
|
||||
#if __GNUC_PREREQ (4, 4)
|
||||
# define __HAVE_FLOAT128 1
|
||||
#else
|
||||
# define __HAVE_FLOAT128 0
|
||||
#endif
|
||||
|
||||
/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
|
||||
from the default float, double and long double types in this glibc. */
|
||||
#if __HAVE_FLOAT128
|
||||
# define __HAVE_DISTINCT_FLOAT128 1
|
||||
#else
|
||||
# define __HAVE_DISTINCT_FLOAT128 0
|
||||
#endif
|
||||
|
||||
/* Defined to 1 if the current compiler invocation provides a
|
||||
floating-point type with the right format for _Float64x, and this
|
||||
glibc includes corresponding *f64x interfaces for it. */
|
||||
#define __HAVE_FLOAT64X 1
|
||||
|
||||
/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
|
||||
of long double. Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
|
||||
the format of _Float128, which must be different from that of long
|
||||
double. */
|
||||
#define __HAVE_FLOAT64X_LONG_DOUBLE 1
|
||||
|
||||
#ifndef __ASSEMBLER__
|
||||
|
||||
/* Defined to concatenate the literal suffix to be used with _Float128
|
||||
types, if __HAVE_FLOAT128 is 1. */
|
||||
# if __HAVE_FLOAT128
|
||||
# if !__GNUC_PREREQ (7, 0) || (defined __cplusplus && !__GNUC_PREREQ (13, 0))
|
||||
/* The literal suffix f128 exists only since GCC 7.0. */
|
||||
# define __f128(x) x##q
|
||||
# else
|
||||
# define __f128(x) x##f128
|
||||
# endif
|
||||
# endif
|
||||
|
||||
/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1. */
|
||||
# if __HAVE_FLOAT128
|
||||
# if !__GNUC_PREREQ (7, 0) || (defined __cplusplus && !__GNUC_PREREQ (13, 0))
|
||||
/* Add a typedef for older GCC compilers which don't natively support
|
||||
_Complex _Float128. */
|
||||
typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__)));
|
||||
# define __CFLOAT128 __cfloat128
|
||||
# else
|
||||
# define __CFLOAT128 _Complex _Float128
|
||||
# endif
|
||||
# endif
|
||||
|
||||
/* The remainder of this file provides support for older compilers. */
|
||||
# if __HAVE_FLOAT128
|
||||
|
||||
/* The type _Float128 exists only since GCC 7.0. */
|
||||
# if !__GNUC_PREREQ (7, 0) || (defined __cplusplus && !__GNUC_PREREQ (13, 0))
|
||||
typedef __float128 _Float128;
|
||||
# endif
|
||||
|
||||
/* __builtin_huge_valf128 doesn't exist before GCC 7.0. */
|
||||
# if !__GNUC_PREREQ (7, 0)
|
||||
# define __builtin_huge_valf128() ((_Float128) __builtin_huge_val ())
|
||||
# endif
|
||||
|
||||
/* Older GCC has only a subset of built-in functions for _Float128 on
|
||||
ia64, and __builtin_infq is not usable in static initializers.
|
||||
Converting a narrower sNaN to _Float128 produces a quiet NaN, so
|
||||
attempts to use _Float128 sNaNs will not work properly with older
|
||||
compilers. */
|
||||
# if !__GNUC_PREREQ (7, 0)
|
||||
# define __builtin_copysignf128 __builtin_copysignq
|
||||
# define __builtin_fabsf128 __builtin_fabsq
|
||||
# define __builtin_inff128() ((_Float128) __builtin_inf ())
|
||||
# define __builtin_nanf128(x) ((_Float128) __builtin_nan (x))
|
||||
# define __builtin_nansf128(x) ((_Float128) __builtin_nans (x))
|
||||
# endif
|
||||
|
||||
/* In math/math.h, __MATH_TG will expand signbit to __builtin_signbit*,
|
||||
e.g.: __builtin_signbitf128, before GCC 6. However, there has never
|
||||
been a __builtin_signbitf128 in GCC and the type-generic builtin is
|
||||
only available since GCC 6. */
|
||||
# if !__GNUC_PREREQ (6, 0)
|
||||
# define __builtin_signbitf128 __signbitf128
|
||||
# endif
|
||||
|
||||
# endif
|
||||
|
||||
#endif /* !__ASSEMBLER__. */
|
||||
|
||||
#include <bits/floatn-common.h>
|
||||
|
||||
#endif /* _BITS_FLOATN_H */
|
@ -1,24 +0,0 @@
|
||||
/* Define __FP_LOGB0_IS_MIN and __FP_LOGBNAN_IS_MIN. IA64 version.
|
||||
Copyright (C) 2016-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _MATH_H
|
||||
# error "Never use <bits/fp-logb.h> directly; include <math.h> instead."
|
||||
#endif
|
||||
|
||||
#define __FP_LOGB0_IS_MIN 1
|
||||
#define __FP_LOGBNAN_IS_MIN 0
|
@ -1,62 +0,0 @@
|
||||
/* Copyright (C) 2005-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _LINK_H
|
||||
# error "Never include <bits/link.h> directly; use <link.h> instead."
|
||||
#endif
|
||||
|
||||
/* Registers for entry into PLT on ia64. */
|
||||
typedef struct La_ia64_regs
|
||||
{
|
||||
uint64_t lr_r8;
|
||||
uint64_t lr_r9;
|
||||
uint64_t lr_r10;
|
||||
uint64_t lr_r11;
|
||||
uint64_t lr_gr [8];
|
||||
long double lr_fr [8];
|
||||
uint64_t lr_unat;
|
||||
uint64_t lr_sp;
|
||||
} La_ia64_regs;
|
||||
|
||||
/* Return values for calls from PLT on ia64. */
|
||||
typedef struct La_ia64_retval
|
||||
{
|
||||
uint64_t lrv_r8;
|
||||
uint64_t lrv_r9;
|
||||
uint64_t lrv_r10;
|
||||
uint64_t lrv_r11;
|
||||
long double lr_fr [8];
|
||||
} La_ia64_retval;
|
||||
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
extern Elf64_Addr la_ia64_gnu_pltenter (Elf64_Sym *__sym, unsigned int __ndx,
|
||||
uintptr_t *__refcook,
|
||||
uintptr_t *__defcook,
|
||||
La_ia64_regs *__regs,
|
||||
unsigned int *__flags,
|
||||
const char *__symname,
|
||||
long int *__framesizep);
|
||||
extern unsigned int la_ia64_gnu_pltexit (Elf64_Sym *__sym, unsigned int __ndx,
|
||||
uintptr_t *__refcook,
|
||||
uintptr_t *__defcook,
|
||||
const La_ia64_regs *__inregs,
|
||||
La_ia64_retval *__outregs,
|
||||
const char *__symname);
|
||||
|
||||
__END_DECLS
|
@ -1,3 +0,0 @@
|
||||
/* ia64 does not export __bzero symbol. */
|
||||
#define __bzero bzero
|
||||
#include <string/bzero.c>
|
9
sysdeps/ia64/configure
vendored
9
sysdeps/ia64/configure
vendored
@ -1,9 +0,0 @@
|
||||
# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
|
||||
# Local configure fragment for sysdeps/ia64.
|
||||
|
||||
# PIE builds fail on binutils 2.37 and earlier, see:
|
||||
# https://sourceware.org/bugzilla/show_bug.cgi?id=28672
|
||||
printf "%s\n" "#define PIE_UNSUPPORTED 1" >>confdefs.h
|
||||
|
||||
# work around problem with autoconf and empty lines at the end of files
|
||||
|
@ -1,7 +0,0 @@
|
||||
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
|
||||
# Local configure fragment for sysdeps/ia64.
|
||||
|
||||
# PIE builds fail on binutils 2.37 and earlier, see:
|
||||
# https://sourceware.org/bugzilla/show_bug.cgi?id=28672
|
||||
AC_DEFINE(PIE_UNSUPPORTED)
|
||||
# work around problem with autoconf and empty lines at the end of files
|
@ -1,162 +0,0 @@
|
||||
/* Special .init and .fini section support for IA64.
|
||||
Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
In addition to the permissions in the GNU Lesser General Public
|
||||
License, the Free Software Foundation gives you unlimited
|
||||
permission to link the compiled version of this file with other
|
||||
programs, and to distribute those programs without any restriction
|
||||
coming from the use of this file. (The GNU Lesser General Public
|
||||
License restrictions do apply in other respects; for example, they
|
||||
cover modification of the file, and distribution when not linked
|
||||
into another program.)
|
||||
|
||||
Note that people who make modified versions of this file are not
|
||||
obligated to grant this special exception for their modified
|
||||
versions; it is their choice whether to do so. The GNU Lesser
|
||||
General Public License gives permission to release a modified
|
||||
version without this exception; this exception also makes it
|
||||
possible to release a modified version which carries forward this
|
||||
exception.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* crti.S puts a function prologue at the beginning of the .init and
|
||||
.fini sections and defines global symbols for those addresses, so
|
||||
they can be called as functions. The symbols _init and _fini are
|
||||
magic and cause the linker to emit DT_INIT and DT_FINI. */
|
||||
|
||||
#include <libc-symbols.h>
|
||||
#include <sysdep.h>
|
||||
#undef ret
|
||||
|
||||
#ifndef PREINIT_FUNCTION
|
||||
# define PREINIT_FUNCTION __gmon_start__
|
||||
#endif
|
||||
|
||||
#ifndef PREINIT_FUNCTION_WEAK
|
||||
# define PREINIT_FUNCTION_WEAK 1
|
||||
#endif
|
||||
|
||||
#if PREINIT_FUNCTION_WEAK
|
||||
weak_extern (PREINIT_FUNCTION)
|
||||
#else
|
||||
.hidden PREINIT_FUNCTION
|
||||
#endif
|
||||
|
||||
/* If we have working .init_array support, we want to keep the .init
|
||||
section empty (apart from the mandatory prologue/epilogue). This
|
||||
ensures that the default unwind conventions (return-pointer in b0,
|
||||
frame state in ar.pfs, etc.) will do the Right Thing. To ensure
|
||||
an empty .init section, we register gmon_initializer() via the
|
||||
.init_array.
|
||||
|
||||
--davidm 02/10/29 */
|
||||
|
||||
#if PREINIT_FUNCTION_WEAK
|
||||
/* This blob of assembly code is one simple C function:
|
||||
|
||||
static void
|
||||
__attribute__ ((used))
|
||||
gmon_initializer (void)
|
||||
{
|
||||
extern void weak_function __gmon_start__ (void);
|
||||
|
||||
if (__gmon_start__)
|
||||
(*__gmon_start__)();
|
||||
}
|
||||
*/
|
||||
.text
|
||||
.align 64
|
||||
.proc gmon_initializer#
|
||||
gmon_initializer:
|
||||
.prologue 12, 32
|
||||
.mmi
|
||||
.save ar.pfs, r33
|
||||
alloc r33 = ar.pfs, 0, 3, 0, 0
|
||||
addl r14 = @ltoff(@fptr(PREINIT_FUNCTION#)), gp
|
||||
.save rp, r32
|
||||
mov r32 = b0
|
||||
.mmi
|
||||
mov r34 = r1
|
||||
.body
|
||||
;;
|
||||
ld8 r14 = [r14]
|
||||
nop 0
|
||||
;;
|
||||
.mib
|
||||
cmp.eq p6, p7 = 0, r14
|
||||
nop 0
|
||||
(p6) br.cond.spnt .L1
|
||||
;;
|
||||
.mib
|
||||
nop 0
|
||||
nop 0
|
||||
br.call.sptk.many b0 = PREINIT_FUNCTION#
|
||||
;;
|
||||
.mmi
|
||||
mov r1 = r34
|
||||
nop 0
|
||||
nop 0
|
||||
.L1:
|
||||
.mii
|
||||
nop 0
|
||||
mov ar.pfs = r33
|
||||
nop 0
|
||||
;;
|
||||
.mib
|
||||
nop 0
|
||||
mov b0 = r32
|
||||
br.ret.sptk.many b0
|
||||
.endp gmon_initializer#
|
||||
# undef PREINIT_FUNCTION
|
||||
# define PREINIT_FUNCTION gmon_initializer
|
||||
#endif
|
||||
.section .init_array, "aw"
|
||||
data8 @fptr(PREINIT_FUNCTION)
|
||||
|
||||
.section .init,"ax",@progbits
|
||||
.global _init#
|
||||
.hidden _init#
|
||||
.proc _init#
|
||||
_init:
|
||||
.prologue
|
||||
.save ar.pfs, r34
|
||||
alloc r34 = ar.pfs, 0, 3, 0, 0
|
||||
.vframe r32
|
||||
mov r32 = r12
|
||||
.save rp, r33
|
||||
mov r33 = b0
|
||||
.body
|
||||
adds r12 = -16, r12
|
||||
;; /* see gmon_initializer() above */
|
||||
.endp _init#
|
||||
|
||||
.section .fini,"ax",@progbits
|
||||
.global _fini#
|
||||
.hidden _fini#
|
||||
.proc _fini#
|
||||
_fini:
|
||||
.prologue
|
||||
.save ar.pfs, r34
|
||||
alloc r34 = ar.pfs, 0, 3, 0, 0
|
||||
.vframe r32
|
||||
mov r32 = r12
|
||||
.save rp, r33
|
||||
mov r33 = b0
|
||||
.body
|
||||
adds r12 = -16, r12
|
||||
;;
|
||||
.endp _fini#
|
@ -1,69 +0,0 @@
|
||||
/* Special .init and .fini section support for IA64.
|
||||
Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
In addition to the permissions in the GNU Lesser General Public
|
||||
License, the Free Software Foundation gives you unlimited
|
||||
permission to link the compiled version of this file with other
|
||||
programs, and to distribute those programs without any restriction
|
||||
coming from the use of this file. (The GNU Lesser General Public
|
||||
License restrictions do apply in other respects; for example, they
|
||||
cover modification of the file, and distribution when not linked
|
||||
into another program.)
|
||||
|
||||
Note that people who make modified versions of this file are not
|
||||
obligated to grant this special exception for their modified
|
||||
versions; it is their choice whether to do so. The GNU Lesser
|
||||
General Public License gives permission to release a modified
|
||||
version without this exception; this exception also makes it
|
||||
possible to release a modified version which carries forward this
|
||||
exception.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#undef ret
|
||||
|
||||
/* crtn.S puts function epilogues in the .init and .fini sections
|
||||
corresponding to the prologues in crti.S. */
|
||||
|
||||
.section .init,"ax",@progbits
|
||||
.proc _init#
|
||||
_init:
|
||||
.prologue
|
||||
.save ar.pfs, r34
|
||||
.vframe r32
|
||||
.save rp, r33
|
||||
.body
|
||||
.regstk 0,2,0,0
|
||||
mov r12 = r32
|
||||
mov ar.pfs = r34
|
||||
mov b0 = r33
|
||||
br.ret.sptk.many b0
|
||||
.endp _init#
|
||||
|
||||
.section .fini,"ax",@progbits
|
||||
.proc _fini#
|
||||
_fini:
|
||||
.prologue
|
||||
.save ar.pfs, r34
|
||||
.vframe r32
|
||||
.save rp, r33
|
||||
.body
|
||||
mov r12 = r32
|
||||
mov ar.pfs = r34
|
||||
mov b0 = r33
|
||||
br.ret.sptk.many b0
|
||||
.endp _fini#
|
@ -1,21 +0,0 @@
|
||||
/* Configuration of lookup functions. IA-64 version.
|
||||
Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Number of extra dynamic section entries for this architecture. By
|
||||
default there are none. */
|
||||
#define DT_THISPROCNUM DT_IA_64_NUM
|
@ -1,45 +0,0 @@
|
||||
/* Function descriptors. IA64 version.
|
||||
Copyright (C) 2003-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef dl_ia64_fptr_h
|
||||
#define dl_ia64_fptr_h 1
|
||||
|
||||
#include <ia64intrin.h>
|
||||
#include <sysdeps/generic/dl-fptr.h>
|
||||
|
||||
#define COMPARE_AND_SWAP(ptr, old, new) \
|
||||
__sync_bool_compare_and_swap (ptr, old, new)
|
||||
|
||||
/* There are currently 123 dynamic symbols in ld.so.
|
||||
ELF_MACHINE_BOOT_FPTR_TABLE_LEN needs to be at least that big. */
|
||||
#define ELF_MACHINE_BOOT_FPTR_TABLE_LEN 200
|
||||
|
||||
#define ELF_MACHINE_LOAD_ADDRESS(var, symbol) \
|
||||
asm ("movl %0 = @gprel (" #symbol ");; add %0 = %0, gp" : "=&r" (var));
|
||||
|
||||
/* We don't have a gcc helper to extract the plabel info. */
|
||||
#define ELF_PTR_TO_FDESC(ptr) \
|
||||
({ union { \
|
||||
void *_ptr; \
|
||||
struct fdesc *_fdesc; \
|
||||
} _u; \
|
||||
_u._ptr = ptr; \
|
||||
_u._fdesc; \
|
||||
})
|
||||
|
||||
#endif /* !dl_ia64_fptr_h */
|
@ -1,79 +0,0 @@
|
||||
/* Configuration of lookup functions.
|
||||
Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#define ELF_FUNCTION_PTR_IS_SPECIAL
|
||||
#define DL_UNMAP_IS_SPECIAL
|
||||
|
||||
#include <dl-fptr.h>
|
||||
|
||||
/* We do not support copy relocations for IA-64. */
|
||||
#define DL_NO_COPY_RELOCS
|
||||
|
||||
/* Forward declaration. */
|
||||
struct link_map;
|
||||
|
||||
extern void *_dl_symbol_address (struct link_map *map, const Elf64_Sym *ref);
|
||||
rtld_hidden_proto (_dl_symbol_address)
|
||||
|
||||
#define DL_SYMBOL_ADDRESS(map, ref) _dl_symbol_address(map, ref)
|
||||
|
||||
extern Elf64_Addr _dl_lookup_address (const void *address);
|
||||
|
||||
#define DL_LOOKUP_ADDRESS(addr) _dl_lookup_address (addr)
|
||||
|
||||
extern void attribute_hidden _dl_unmap (struct link_map *map);
|
||||
|
||||
#define DL_UNMAP(map) _dl_unmap (map)
|
||||
|
||||
#define DL_DT_FUNCTION_ADDRESS(map, start, attr, addr) \
|
||||
attr volatile unsigned long int fptr[2]; \
|
||||
fptr[0] = (unsigned long int) (start); \
|
||||
fptr[1] = (map)->l_info[DT_PLTGOT]->d_un.d_ptr; \
|
||||
addr = (ElfW(Addr)) fptr; \
|
||||
|
||||
#define DL_CALL_DT_INIT(map, start, argc, argv, env) \
|
||||
{ \
|
||||
ElfW(Addr) addr; \
|
||||
DL_DT_FUNCTION_ADDRESS(map, start, , addr) \
|
||||
dl_init_t init = (dl_init_t) addr; \
|
||||
init (argc, argv, env); \
|
||||
}
|
||||
|
||||
#define DL_CALL_DT_FINI(map, start) \
|
||||
{ \
|
||||
ElfW(Addr) addr; \
|
||||
DL_DT_FUNCTION_ADDRESS(map, start, , addr) \
|
||||
fini_t fini = (fini_t) addr; \
|
||||
fini (); \
|
||||
}
|
||||
|
||||
/* The type of the return value of fixup/profile_fixup. */
|
||||
#define DL_FIXUP_VALUE_TYPE struct fdesc
|
||||
/* Construct a value of type DL_FIXUP_VALUE_TYPE from a code address
|
||||
and a link map. */
|
||||
#define DL_FIXUP_MAKE_VALUE(map, addr) \
|
||||
((struct fdesc) { (addr), (map)->l_info[DT_PLTGOT]->d_un.d_ptr })
|
||||
/* Extract the code address from a value of type DL_FIXUP_VALUE_TYPE.
|
||||
*/
|
||||
#define DL_FIXUP_VALUE_CODE_ADDR(value) (value).ip
|
||||
|
||||
#define DL_FIXUP_VALUE_ADDR(value) ((uintptr_t) &(value))
|
||||
#define DL_FIXUP_ADDR_VALUE(addr) (*(struct fdesc *) (addr))
|
||||
#define DL_FIXUP_BINDNOW_ADDR_VALUE(addr) (addr)
|
||||
#define DL_FIXUP_BINDNOW_RELOC(l, reloc, value, new_value, st_value, lazy) \
|
||||
(*value) = *(struct fdesc *) (st_value)
|
@ -1,460 +0,0 @@
|
||||
/* Machine-dependent ELF dynamic relocation inline functions. IA-64 version.
|
||||
Copyright (C) 1995-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef dl_machine_h
|
||||
#define dl_machine_h 1
|
||||
|
||||
#define ELF_MACHINE_NAME "ia64"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <link.h>
|
||||
#include <errno.h>
|
||||
#include <dl-fptr.h>
|
||||
#include <tls.h>
|
||||
#include <dl-static-tls.h>
|
||||
#include <dl-machine-rel.h>
|
||||
|
||||
/* Translate a processor specific dynamic tag to the index
|
||||
in l_info array. */
|
||||
#define DT_IA_64(x) (DT_IA_64_##x - DT_LOPROC + DT_NUM)
|
||||
|
||||
static inline void __attribute__ ((always_inline))
|
||||
__ia64_init_bootstrap_fdesc_table (struct link_map *map)
|
||||
{
|
||||
Elf64_Addr *boot_table;
|
||||
|
||||
/* careful: this will be called before got has been relocated... */
|
||||
asm (";; addl %0 = @gprel (_dl_boot_fptr_table), gp" : "=r"(boot_table));
|
||||
|
||||
map->l_mach.fptr_table_len = ELF_MACHINE_BOOT_FPTR_TABLE_LEN;
|
||||
map->l_mach.fptr_table = boot_table;
|
||||
}
|
||||
|
||||
#define ELF_MACHINE_BEFORE_RTLD_RELOC(map, dynamic_info) \
|
||||
__ia64_init_bootstrap_fdesc_table (map);
|
||||
|
||||
/* Return nonzero iff ELF header is compatible with the running host. */
|
||||
static inline int __attribute__ ((unused))
|
||||
elf_machine_matches_host (const Elf64_Ehdr *ehdr)
|
||||
{
|
||||
return ehdr->e_machine == EM_IA_64;
|
||||
}
|
||||
|
||||
|
||||
/* Return the link-time address of _DYNAMIC. */
|
||||
static inline Elf64_Addr __attribute__ ((unused, const))
|
||||
elf_machine_dynamic (void)
|
||||
{
|
||||
Elf64_Addr *p;
|
||||
|
||||
__asm__ (
|
||||
".section .sdata\n"
|
||||
" .type __dynamic_ltv#, @object\n"
|
||||
" .size __dynamic_ltv#, 8\n"
|
||||
"__dynamic_ltv:\n"
|
||||
" data8 @ltv(_DYNAMIC#)\n"
|
||||
".previous\n"
|
||||
" addl %0 = @gprel(__dynamic_ltv#), gp ;;"
|
||||
: "=r" (p));
|
||||
|
||||
return *p;
|
||||
}
|
||||
|
||||
|
||||
/* Return the run-time load address of the shared object. */
|
||||
static inline Elf64_Addr __attribute__ ((unused))
|
||||
elf_machine_load_address (void)
|
||||
{
|
||||
Elf64_Addr ip;
|
||||
int *p;
|
||||
|
||||
__asm__ (
|
||||
"1: mov %0 = ip\n"
|
||||
".section .sdata\n"
|
||||
"2: data4 @ltv(1b)\n"
|
||||
" .align 8\n"
|
||||
".previous\n"
|
||||
" addl %1 = @gprel(2b), gp ;;"
|
||||
: "=r" (ip), "=r" (p));
|
||||
|
||||
return ip - (Elf64_Addr) *p;
|
||||
}
|
||||
|
||||
/* Set up the loaded object described by L so its unrelocated PLT
|
||||
entries will jump to the on-demand fixup code in dl-runtime.c. */
|
||||
|
||||
static inline int __attribute__ ((unused, always_inline))
|
||||
elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
int lazy, int profile)
|
||||
{
|
||||
extern void _dl_runtime_resolve (void);
|
||||
extern void _dl_runtime_profile (void);
|
||||
|
||||
if (lazy)
|
||||
{
|
||||
register Elf64_Addr gp __asm__ ("gp");
|
||||
Elf64_Addr *reserve, doit;
|
||||
|
||||
/*
|
||||
* Careful with the typecast here or it will try to add l->l_addr
|
||||
* pointer elements
|
||||
*/
|
||||
reserve = ((Elf64_Addr *)
|
||||
(l->l_info[DT_IA_64 (PLT_RESERVE)]->d_un.d_ptr + l->l_addr));
|
||||
/* Identify this shared object. */
|
||||
reserve[0] = (Elf64_Addr) l;
|
||||
|
||||
/* This function will be called to perform the relocation. */
|
||||
#ifdef SHARED
|
||||
if (__glibc_unlikely (profile))
|
||||
{
|
||||
if (GLRO(dl_profile) != NULL
|
||||
&& _dl_name_match_p (GLRO(dl_profile), l))
|
||||
{
|
||||
/* This is the object we are looking for. Say that we really
|
||||
want profiling and the timers are started. */
|
||||
GL(dl_profile_map) = l;
|
||||
}
|
||||
doit = (Elf64_Addr) ELF_PTR_TO_FDESC (&_dl_runtime_profile)->ip;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
doit = (Elf64_Addr) ELF_PTR_TO_FDESC (&_dl_runtime_resolve)->ip;
|
||||
}
|
||||
|
||||
reserve[1] = doit;
|
||||
reserve[2] = gp;
|
||||
}
|
||||
|
||||
return lazy;
|
||||
}
|
||||
|
||||
/* Names of the architecture-specific auditing callback functions. */
|
||||
#define ARCH_LA_PLTENTER ia64_gnu_pltenter
|
||||
#define ARCH_LA_PLTEXIT ia64_gnu_pltexit
|
||||
|
||||
/* Undo the adds out0 = 16, sp below to get at the value we want in
|
||||
__libc_stack_end. */
|
||||
#define DL_STACK_END(cookie) \
|
||||
((void *) (((long) (cookie)) - 16))
|
||||
|
||||
/* Initial entry point code for the dynamic linker.
|
||||
The C function `_dl_start' is the real entry point;
|
||||
its return value is the user program's entry point. */
|
||||
|
||||
#define RTLD_START asm ( \
|
||||
".text\n" \
|
||||
" .global _start#\n" \
|
||||
" .proc _start#\n" \
|
||||
"_start:\n" \
|
||||
"0: { .mii\n" \
|
||||
" .prologue\n" \
|
||||
" .save rp, r0\n" \
|
||||
" .body\n" \
|
||||
" .prologue\n" \
|
||||
" .save ar.pfs, r32\n" \
|
||||
" alloc loc0 = ar.pfs, 0, 3, 4, 0\n" \
|
||||
" .body\n" \
|
||||
" mov r2 = ip\n" \
|
||||
" addl r3 = @gprel(0b), r0\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mlx\n" \
|
||||
" /* Calculate the GP, and save a copy in loc1. */\n" \
|
||||
" sub gp = r2, r3\n" \
|
||||
" movl r8 = 0x9804c0270033f\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mii\n" \
|
||||
" mov ar.fpsr = r8\n" \
|
||||
" sub loc1 = r2, r3\n" \
|
||||
" /* _dl_start wants a pointer to the pointer to the arg block and\n" \
|
||||
" the arg block starts with an integer, thus the magic 16. */\n" \
|
||||
" adds out0 = 16, sp\n" \
|
||||
" }\n" \
|
||||
" { .bbb\n" \
|
||||
" br.call.sptk.many b0 = _dl_start#\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" .endp _start#\n" \
|
||||
" /* FALLTHRU */\n" \
|
||||
" .global _dl_start_user#\n" \
|
||||
" .proc _dl_start_user#\n" \
|
||||
"_dl_start_user:\n" \
|
||||
" .prologue\n" \
|
||||
" .save rp, r0\n" \
|
||||
" .body\n" \
|
||||
" .prologue\n" \
|
||||
" .save ar.pfs, r32\n" \
|
||||
" .body\n" \
|
||||
" { .mii\n" \
|
||||
" /* Save the pointer to the user entry point fptr in loc2. */\n" \
|
||||
" mov loc2 = ret0\n" \
|
||||
" addl r2 = @ltoff(_dl_argc), gp\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mii\n" \
|
||||
" ld8 out1 = [r2] /* Get the _dl_argc address. */\n" \
|
||||
" addl r3 = @ltoff(_dl_argv), gp\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mmi\n" \
|
||||
" ld8 out2 = [r3] /* Get the _dl_argv address. */\n" \
|
||||
" ld8 out1 = [out1] /* Get the adjusted _dl_argc. */\n" \
|
||||
" addl r2 = @gprel(_rtld_local), gp\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mmi\n" \
|
||||
" sxt4 out3 = out1 /* envp = argv + argc + 1 */\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mmi\n" \
|
||||
" adds out3 = 1, out3\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mmi\n" \
|
||||
" ld8 out2 = [out2] /* Get the adjusted _dl_argv. */\n" \
|
||||
" shladd out3 = out3, 3, r0\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mmb\n" \
|
||||
" add out3 = out3, out2\n" \
|
||||
" ld8 out0 = [r2] /* Get the linkmap. */\n" \
|
||||
" br.call.sptk.many b0 = _dl_init#\n" \
|
||||
" }\n" \
|
||||
" /* Pass our finalizer function to the user,\n" \
|
||||
" and jump to the user's entry point. */\n" \
|
||||
" { .mmi\n" \
|
||||
" ld8 r3 = [loc2], 8\n" \
|
||||
" mov b0 = r0\n" \
|
||||
" }\n" \
|
||||
" { .mmi\n" \
|
||||
" addl ret0 = @ltoff(@fptr(_dl_fini#)), gp\n" \
|
||||
" ;;\n" \
|
||||
" mov b6 = r3\n" \
|
||||
" }\n" \
|
||||
" { .mmi\n" \
|
||||
" ld8 ret0 = [ret0]\n" \
|
||||
" ld8 gp = [loc2]\n" \
|
||||
" mov ar.pfs = loc0\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" { .mfb\n" \
|
||||
" br.sptk.many b6\n" \
|
||||
" ;;\n" \
|
||||
" }\n" \
|
||||
" .endp _dl_start_user#\n" \
|
||||
".previous\n");
|
||||
|
||||
|
||||
#ifndef RTLD_START_SPECIAL_INIT
|
||||
#define RTLD_START_SPECIAL_INIT /* nothing */
|
||||
#endif
|
||||
|
||||
/* ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or TLS
|
||||
variable, so undefined references should not be allowed to define the
|
||||
value.
|
||||
ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
|
||||
of the main executable's symbols, as for a COPY reloc, which we don't
|
||||
use. */
|
||||
/* ??? Ignore *MSB for now. */
|
||||
#define elf_machine_type_class(type) \
|
||||
(((type) == R_IA64_IPLTLSB || (type) == R_IA64_DTPMOD64LSB \
|
||||
|| (type) == R_IA64_DTPREL64LSB || (type) == R_IA64_TPREL64LSB) \
|
||||
* ELF_RTYPE_CLASS_PLT)
|
||||
|
||||
/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries. */
|
||||
#define ELF_MACHINE_JMP_SLOT R_IA64_IPLTLSB
|
||||
|
||||
/* Return the address of the entry point. */
|
||||
#define ELF_MACHINE_START_ADDRESS(map, start) \
|
||||
({ \
|
||||
ElfW(Addr) addr; \
|
||||
DL_DT_FUNCTION_ADDRESS(map, start, static, addr) \
|
||||
addr; \
|
||||
})
|
||||
|
||||
/* Fixup a PLT entry to bounce directly to the function at VALUE. */
|
||||
static inline struct fdesc __attribute__ ((always_inline))
|
||||
elf_machine_fixup_plt (struct link_map *l, lookup_t t,
|
||||
const ElfW(Sym) *refsym, const ElfW(Sym) *sym,
|
||||
const Elf64_Rela *reloc,
|
||||
Elf64_Addr *reloc_addr, struct fdesc value)
|
||||
{
|
||||
/* l is the link_map for the caller, t is the link_map for the object
|
||||
* being called */
|
||||
/* got has already been relocated in elf_get_dynamic_info() */
|
||||
reloc_addr[1] = value.gp;
|
||||
/* we need a "release" here to ensure that the gp is visible before
|
||||
the code entry point is updated: */
|
||||
((volatile Elf64_Addr *) reloc_addr)[0] = value.ip;
|
||||
return value;
|
||||
}
|
||||
|
||||
/* Return the final value of a plt relocation. */
|
||||
static inline struct fdesc
|
||||
elf_machine_plt_value (struct link_map *map, const Elf64_Rela *reloc,
|
||||
struct fdesc value)
|
||||
{
|
||||
/* No need to handle rel vs rela since IA64 is rela only */
|
||||
return (struct fdesc) { value.ip + reloc->r_addend, value.gp };
|
||||
}
|
||||
|
||||
#endif /* !dl_machine_h */
|
||||
|
||||
#ifdef RESOLVE_MAP
|
||||
|
||||
#define R_IA64_TYPE(R) ((R) & -8)
|
||||
#define R_IA64_FORMAT(R) ((R) & 7)
|
||||
|
||||
#define R_IA64_FORMAT_32MSB 4
|
||||
#define R_IA64_FORMAT_32LSB 5
|
||||
#define R_IA64_FORMAT_64MSB 6
|
||||
#define R_IA64_FORMAT_64LSB 7
|
||||
|
||||
|
||||
/* Perform the relocation specified by RELOC and SYM (which is fully
|
||||
resolved). MAP is the object containing the reloc. */
|
||||
static inline void
|
||||
__attribute ((always_inline))
|
||||
elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
|
||||
const Elf64_Rela *reloc,
|
||||
const Elf64_Sym *sym,
|
||||
const struct r_found_version *version,
|
||||
void *const reloc_addr_arg,
|
||||
int skip_ifunc)
|
||||
{
|
||||
Elf64_Addr *const reloc_addr = reloc_addr_arg;
|
||||
const unsigned long int r_type = ELF64_R_TYPE (reloc->r_info);
|
||||
Elf64_Addr value;
|
||||
|
||||
/* We cannot use a switch here because we cannot locate the switch
|
||||
jump table until we've self-relocated. */
|
||||
|
||||
#if !defined RTLD_BOOTSTRAP
|
||||
if (__builtin_expect (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_REL64LSB),
|
||||
0))
|
||||
{
|
||||
assert (ELF64_R_TYPE (reloc->r_info) == R_IA64_REL64LSB);
|
||||
value = *reloc_addr + map->l_addr;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
if (__builtin_expect (r_type == R_IA64_NONE, 0))
|
||||
return;
|
||||
else
|
||||
{
|
||||
struct link_map *sym_map = RESOLVE_MAP (map, scope, &sym, version,
|
||||
r_type);
|
||||
|
||||
/* RESOLVE_MAP() will return NULL if it fails to locate the symbol. */
|
||||
if (sym_map != NULL)
|
||||
{
|
||||
value = SYMBOL_ADDRESS (sym_map, sym, true) + reloc->r_addend;
|
||||
|
||||
if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_DIR64LSB))
|
||||
;/* No adjustment. */
|
||||
else if (r_type == R_IA64_IPLTLSB)
|
||||
{
|
||||
elf_machine_fixup_plt (NULL, NULL, NULL, NULL, reloc, reloc_addr,
|
||||
DL_FIXUP_MAKE_VALUE (sym_map, value));
|
||||
return;
|
||||
}
|
||||
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_FPTR64LSB))
|
||||
value = _dl_make_fptr (sym_map, sym, value);
|
||||
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_PCREL64LSB))
|
||||
value -= (Elf64_Addr) reloc_addr & -16;
|
||||
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_DTPMOD64LSB))
|
||||
#ifdef RTLD_BOOTSTRAP
|
||||
/* During startup the dynamic linker is always index 1. */
|
||||
value = 1;
|
||||
#else
|
||||
/* Get the information from the link map returned by the
|
||||
resolve function. */
|
||||
value = sym_map->l_tls_modid;
|
||||
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_DTPREL64LSB))
|
||||
value -= sym_map->l_addr;
|
||||
#endif
|
||||
else if (R_IA64_TYPE (r_type) == R_IA64_TYPE (R_IA64_TPREL64LSB))
|
||||
{
|
||||
#ifndef RTLD_BOOTSTRAP
|
||||
CHECK_STATIC_TLS (map, sym_map);
|
||||
#endif
|
||||
value += sym_map->l_tls_offset - sym_map->l_addr;
|
||||
}
|
||||
else
|
||||
_dl_reloc_bad_type (map, r_type, 0);
|
||||
}
|
||||
else
|
||||
value = 0;
|
||||
}
|
||||
|
||||
/* ??? Ignore MSB and Instruction format for now. */
|
||||
if (R_IA64_FORMAT (r_type) == R_IA64_FORMAT_64LSB)
|
||||
*reloc_addr = value;
|
||||
else if (R_IA64_FORMAT (r_type) == R_IA64_FORMAT_32LSB)
|
||||
*(int *) reloc_addr = value;
|
||||
else if (r_type == R_IA64_IPLTLSB)
|
||||
{
|
||||
reloc_addr[0] = 0;
|
||||
reloc_addr[1] = 0;
|
||||
}
|
||||
else
|
||||
_dl_reloc_bad_type (map, r_type, 0);
|
||||
}
|
||||
|
||||
/* Let do-rel.h know that on IA-64 if l_addr is 0, all RELATIVE relocs
|
||||
can be skipped. */
|
||||
#define ELF_MACHINE_REL_RELATIVE 1
|
||||
|
||||
/* Apply one relative relocation: add the load bias L_ADDR to the 64-bit
   word that RELOC_ADDR_ARG points at.  Only R_IA64_REL64LSB is expected
   here; any other type trips the assertion.  */
static inline void
__attribute ((always_inline))
elf_machine_rela_relative (Elf64_Addr l_addr, const Elf64_Rela *reloc,
                           void *const reloc_addr_arg)
{
  /* ??? Ignore MSB and Instruction format for now.  */
  assert (ELF64_R_TYPE (reloc->r_info) == R_IA64_REL64LSB);

  Elf64_Addr *const slot = reloc_addr_arg;
  *slot = *slot + l_addr;
}
|
||||
|
||||
/* Lazy-binding relocation of one entry.  For R_IA64_IPLTLSB the two
   64-bit words at L_ADDR + r_offset are each biased by L_ADDR;
   R_IA64_NONE is ignored; every other relocation type is reported via
   _dl_reloc_bad_type.  SCOPE and SKIP_IFUNC are not used here.  */
static inline void
__attribute ((always_inline))
elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
                      Elf64_Addr l_addr, const Elf64_Rela *reloc,
                      int skip_ifunc)
{
  const unsigned long int type = ELF64_R_TYPE (reloc->r_info);
  Elf64_Addr *const words = (void *) (l_addr + reloc->r_offset);

  switch (type)
    {
    case R_IA64_IPLTLSB:
      /* Both words of the entry get the load bias added.  */
      words[0] += l_addr;
      words[1] += l_addr;
      break;

    case R_IA64_NONE:
      break;

    default:
      _dl_reloc_bad_type (map, type, 1);
      break;
    }
}
|
||||
|
||||
#endif /* RESOLVE_MAP */
|
@ -1,30 +0,0 @@
|
||||
/* Thread-local storage handling in the ELF dynamic linker. IA-64 version.
|
||||
Copyright (C) 2002-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
|
||||
/* On IA-64 the __tls_get_addr function takes the module ID and the
|
||||
offset as parameters. */
|
||||
#define GET_ADDR_ARGS size_t tls_ia64_m, size_t tls_ia64_offset
|
||||
#define GET_ADDR_PARAM tls_ia64_m, tls_ia64_offset
|
||||
#define GET_ADDR_MODULE tls_ia64_m
|
||||
#define GET_ADDR_OFFSET tls_ia64_offset
|
||||
|
||||
/* We have no tls_index type. */
|
||||
#define DONT_USE_TLS_INDEX 1
|
||||
|
||||
extern void *__tls_get_addr (size_t m, size_t offset);
|
@ -1,538 +0,0 @@
|
||||
/* PLT trampolines. ia64 version.
|
||||
Copyright (C) 2005-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#undef ret
|
||||
|
||||
/*
|
||||
This code is used in dl-runtime.c to call the `_dl_fixup' function
|
||||
and then redirect to the address it returns. `_dl_fixup()' takes two
|
||||
arguments, however _dl_profile_fixup() takes five.
|
||||
|
||||
The ABI specifies that we will never see more than 8 input
|
||||
registers to a function call, thus it is safe to simply allocate
|
||||
those, and simpler than playing stack games. */
|
||||
|
||||
/* Used to save and restore 8 incoming fp registers */
|
||||
#define RESOLVE_FRAME_SIZE (16*8)
|
||||
|
||||
/* _dl_runtime_resolve: trampoline that calls _dl_fixup and then jumps
   to the address it returns.

   Inputs read by this routine:
     r16  passed unchanged to _dl_fixup as out0
     r15  multiplied by 24 (the size of one relocation record) and
          passed to _dl_fixup as out1
   (presumably r16 is the link map and r15 the relocation index set up
   by the PLT stub -- confirm against the PLT code.)

   Preserved around the call: b0 and ar.pfs (in loc1/loc0), r8-r11
   (in loc2-loc5) and f8-f15 (spilled to the RESOLVE_FRAME_SIZE byte
   stack frame).

   On return from _dl_fixup, ret0 is moved to b6 and ret1 to gp; after
   restoring the saved state the routine branches to b6.  */
ENTRY(_dl_runtime_resolve)
|
||||
{ .mmi
|
||||
.prologue
|
||||
.save ar.pfs, r40
|
||||
alloc loc0 = ar.pfs, 8, 6, 2, 0
|
||||
/* Use the 16 byte scratch area. r2 will start at f8 and
|
||||
r3 will start at f9. */
|
||||
adds r2 = -(RESOLVE_FRAME_SIZE - 16), r12
|
||||
adds r3 = -(RESOLVE_FRAME_SIZE - 32), r12
|
||||
}
|
||||
{ .mii
|
||||
.fframe RESOLVE_FRAME_SIZE
|
||||
adds r12 = -RESOLVE_FRAME_SIZE, r12
|
||||
.save rp, loc1
|
||||
mov loc1 = b0
|
||||
.body
|
||||
mov loc2 = r8 /* preserve struct value register */
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
mov loc3 = r9 /* preserve language specific register */
|
||||
mov loc4 = r10 /* preserve language specific register */
|
||||
mov loc5 = r11 /* preserve language specific register */
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f8, 32
|
||||
|
||||
stf.spill [r3] = f9, 32
|
||||
/* First argument to _dl_fixup: r16 as received on entry. */
mov out0 = r16
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f10, 32
|
||||
stf.spill [r3] = f11, 32
|
||||
/* out1 = r15 * 16; the shladd in the next bundle adds r15 * 8,
   giving r15 * 24 as the second argument to _dl_fixup. */
shl out1 = r15, 4
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f12, 32
|
||||
stf.spill [r3] = f13, 32
|
||||
/* Each relocation record is 24 bytes. */
|
||||
shladd out1 = r15, 3, out1
|
||||
;;
|
||||
}
|
||||
{ .mmb
|
||||
stf.spill [r2] = f14
|
||||
stf.spill [r3] = f15
|
||||
br.call.sptk.many b0 = _dl_fixup
|
||||
}
|
||||
{ .mii
|
||||
/* Skip the 16-byte scratch area. */
|
||||
adds r2 = 16, r12
|
||||
adds r3 = 32, r12
|
||||
/* b6 = first return value of _dl_fixup; the final branch goes there. */
mov b6 = ret0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f8 = [r2], 32
|
||||
ldf.fill f9 = [r3], 32
|
||||
mov b0 = loc1
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f10 = [r2], 32
|
||||
ldf.fill f11 = [r3], 32
|
||||
/* gp is taken from the second return value of _dl_fixup. */
mov gp = ret1
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f12 = [r2], 32
|
||||
ldf.fill f13 = [r3], 32
|
||||
mov ar.pfs = loc0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f14 = [r2], 32
|
||||
ldf.fill f15 = [r3], 32
|
||||
.restore sp /* pop the unwind frame state */
|
||||
adds r12 = RESOLVE_FRAME_SIZE, r12
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
mov r9 = loc3 /* restore language specific register */
|
||||
mov r10 = loc4 /* restore language specific register */
|
||||
mov r11 = loc5 /* restore language specific register */
|
||||
}
|
||||
{ .mii
|
||||
mov r8 = loc2 /* restore struct value register */
|
||||
;;
|
||||
}
|
||||
/* An alloc is needed for the break system call to work.
|
||||
We don't care about the old value of the pfs register. */
|
||||
{ .mmb
|
||||
.prologue
|
||||
.body
|
||||
alloc r2 = ar.pfs, 0, 0, 8, 0
|
||||
br.sptk.many b6
|
||||
;;
|
||||
}
|
||||
END(_dl_runtime_resolve)
|
||||
|
||||
|
||||
/* The fourth argument to _dl_profile_fixup and the third one to
|
||||
_dl_audit_pltexit are both pointers to La_ia64_regs:
|
||||
|
||||
8byte r8
|
||||
8byte r9
|
||||
8byte r10
|
||||
8byte r11
|
||||
8byte in0
|
||||
8byte in1
|
||||
8byte in2
|
||||
8byte in3
|
||||
8byte in4
|
||||
8byte in5
|
||||
8byte in6
|
||||
8byte in7
|
||||
16byte f8
|
||||
16byte f9
|
||||
16byte f10
|
||||
16byte f11
|
||||
16byte f12
|
||||
16byte f13
|
||||
16byte f14
|
||||
16byte f15
|
||||
8byte ar.unat
|
||||
8byte sp
|
||||
|
||||
The fifth argument to _dl_profile_fixup is a pointer to long int.
|
||||
The fourth argument to _dl_audit_pltexit is a pointer to
|
||||
La_ia64_retval:
|
||||
|
||||
8byte r8
|
||||
8byte r9
|
||||
8byte r10
|
||||
8byte r11
|
||||
16byte f8
|
||||
16byte f9
|
||||
16byte f10
|
||||
16byte f11
|
||||
16byte f12
|
||||
16byte f13
|
||||
16byte f14
|
||||
16byte f15
|
||||
|
||||
Since stack has to be 16 byte aligned, the stack allocation is in
|
||||
16-byte increments. Before calling _dl_profile_fixup, the stack will
|
||||
look like
|
||||
|
||||
psp new frame_size
|
||||
+16 La_ia64_regs
|
||||
sp scratch
|
||||
|
||||
*/
|
||||
|
||||
#define PLTENTER_FRAME_SIZE (4*8 + 8*8 + 8*16 + 2*8 + 16)
|
||||
#define PLTEXIT_FRAME_SIZE (PLTENTER_FRAME_SIZE + 4*8 + 8*16)
|
||||
|
||||
#if !defined PROF && defined SHARED
|
||||
ENTRY(_dl_runtime_profile)
|
||||
{ .mii
|
||||
.prologue
|
||||
.save ar.pfs, r40
|
||||
alloc loc0 = ar.pfs, 8, 12, 8, 0
|
||||
.vframe loc10
|
||||
mov loc10 = r12
|
||||
.save rp, loc1
|
||||
mov loc1 = b0
|
||||
}
|
||||
{ .mii
|
||||
.save ar.unat, r17
|
||||
mov r17 = ar.unat
|
||||
.save ar.lc, loc6
|
||||
mov loc6 = ar.lc
|
||||
mov loc11 = gp
|
||||
}
|
||||
{ .mii
|
||||
.body
|
||||
/* There is a 16 byte scratch area. r2 will start at r8 and
|
||||
r3 will start at r9 for La_ia64_regs. */
|
||||
adds r2 = -(PLTENTER_FRAME_SIZE - 16), r12
|
||||
adds r3 = -(PLTENTER_FRAME_SIZE - 24), r12
|
||||
adds r12 = -PLTENTER_FRAME_SIZE, r12
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
st8 [r2] = r8, 16;
|
||||
st8 [r3] = r9, 16;
|
||||
mov out2 = b0 /* needed by _dl_fixup_profile */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
st8 [r2] = r10, 16;
|
||||
st8 [r3] = r11, 16;
|
||||
adds out3 = 16, r12 /* pointer to La_ia64_regs */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
.mem.offset 0, 0
|
||||
st8.spill [r2] = in0, 16
|
||||
.mem.offset 8, 0
|
||||
st8.spill [r3] = in1, 16
|
||||
mov out4 = loc10 /* pointer to new frame size */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
.mem.offset 0, 0
|
||||
st8.spill [r2] = in2, 16
|
||||
.mem.offset 8, 0
|
||||
st8.spill [r3] = in3, 16
|
||||
mov loc2 = r8 /* preserve struct value register */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
.mem.offset 0, 0
|
||||
st8.spill [r2] = in4, 16
|
||||
.mem.offset 8, 0
|
||||
st8.spill [r3] = in5, 16
|
||||
mov loc3 = r9 /* preserve language specific register */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
.mem.offset 0, 0
|
||||
st8 [r2] = in6, 16
|
||||
.mem.offset 8, 0
|
||||
st8 [r3] = in7, 24 /* adjust for f9 */
|
||||
mov loc4 = r10 /* preserve language specific register */
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
mov r18 = ar.unat /* save it in La_ia64_regs */
|
||||
mov loc7 = out3 /* save it for _dl_audit_pltexit */
|
||||
mov loc5 = r11 /* preserve language specific register */
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f8, 32
|
||||
stf.spill [r3] = f9, 32
|
||||
mov out0 = r16 /* needed by _dl_fixup_profile */
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
mov ar.unat = r17 /* restore it for function call */
|
||||
mov loc8 = r16 /* save it for _dl_audit_pltexit */
|
||||
nop.i 0x0
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f10, 32
|
||||
stf.spill [r3] = f11, 32
|
||||
shl out1 = r15, 4
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f12, 32
|
||||
stf.spill [r3] = f13, 32
|
||||
/* Each relocation record is 24 bytes. */
|
||||
shladd out1 = r15, 3, out1
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f14, 32
|
||||
stf.spill [r3] = f15, 24
|
||||
mov loc9 = out1 /* save it for _dl_audit_pltexit */
|
||||
;;
|
||||
}
|
||||
{ .mmb
|
||||
st8 [r2] = r18 /* store ar.unat */
|
||||
st8 [r3] = loc10 /* store sp */
|
||||
br.call.sptk.many b0 = _dl_profile_fixup
|
||||
}
|
||||
{ .mii
|
||||
/* Skip the 16byte scratch area, 4 language specific GRs and
|
||||
8 incoming GRs to restore incoming fp registers. */
|
||||
adds r2 = (4*8 + 8*8 + 16), r12
|
||||
adds r3 = (4*8 + 8*8 + 32), r12
|
||||
mov b6 = ret0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f8 = [r2], 32
|
||||
ldf.fill f9 = [r3], 32
|
||||
mov gp = ret1
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f10 = [r2], 32
|
||||
ldf.fill f11 = [r3], 32
|
||||
mov r8 = loc2 /* restore struct value register */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f12 = [r2], 32
|
||||
ldf.fill f13 = [r3], 32
|
||||
mov r9 = loc3 /* restore language specific register */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f14 = [r2], 32
|
||||
ldf.fill f15 = [r3], 32
|
||||
mov r10 = loc4 /* restore language specific register */
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
ld8 r15 = [loc10] /* load the new frame size */
|
||||
mov r11 = loc5 /* restore language specific register */
|
||||
;;
|
||||
cmp.eq p6, p7 = -1, r15
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
(p7) cmp.eq p8, p9 = 0, r15
|
||||
(p6) mov b0 = loc1
|
||||
(p6) mov ar.lc = loc6
|
||||
}
|
||||
{ .mib
|
||||
nop.m 0x0
|
||||
(p6) mov ar.pfs = loc0
|
||||
(p6) br.cond.dptk.many .Lresolved
|
||||
;;
|
||||
}
|
||||
|
||||
/* At this point, the stack looks like
|
||||
|
||||
+psp free
|
||||
+16 La_ia64_regs
|
||||
sp scratch
|
||||
|
||||
We need to keep the current stack and call the resolved
|
||||
function by copying the r15 byte from sp + PLTENTER_FRAME_SIZE
|
||||
+ 16 (scratch area) to sp + 16 (scratch area). Since stack
|
||||
has to be 16-byte aligned, we round r15 up to a multiple of 16. */
|
||||
|
||||
{ .mbb
|
||||
(p9) adds r15 = 15, r15
|
||||
(p8) br.cond.dptk.many .Lno_new_frame
|
||||
nop.b 0x0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
and r15 = -16, r15
|
||||
;;
|
||||
/* We don't copy the 16byte scratch area. Prepare r16/r17 as
|
||||
destination. */
|
||||
sub r16 = r12, r15
|
||||
sub r17 = r12, r15
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
adds r16 = 16, r16
|
||||
adds r17 = 24, r17
|
||||
sub r12 = r12, r15 /* Adjust stack */
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
nop.m 0x0
|
||||
shr r15 = r15, 4
|
||||
;;
|
||||
adds r15 = -1, r15
|
||||
;;
|
||||
}
|
||||
{ .mii
|
||||
/* Skip the 16byte scratch area. Prepare r2/r3 as source. */
|
||||
adds r2 = 16, loc10
|
||||
adds r3 = 24, loc10
|
||||
mov ar.lc = r15
|
||||
;;
|
||||
}
|
||||
.Lcopy:
|
||||
{ .mmi
|
||||
ld8 r18 = [r2], 16
|
||||
ld8 r19 = [r3], 16
|
||||
nop.i 0x0
|
||||
;;
|
||||
}
|
||||
{ .mmb
|
||||
st8 [r16] = r18, 16
|
||||
st8 [r17] = r19, 16
|
||||
br.cloop.sptk.few .Lcopy
|
||||
}
|
||||
.Lno_new_frame:
|
||||
{ .mii
|
||||
mov out0 = in0
|
||||
mov out1 = in1
|
||||
mov out2 = in2
|
||||
}
|
||||
{ .mii
|
||||
mov out3 = in3
|
||||
mov out4 = in4
|
||||
mov out5 = in5
|
||||
}
|
||||
{ .mib
|
||||
mov out6 = in6
|
||||
mov out7 = in7
|
||||
/* Call the resolved function */
|
||||
br.call.sptk.many b0 = b6
|
||||
}
|
||||
{ .mii
|
||||
/* Prepare stack for _dl_audit_pltexit. Loc10 has the original
|
||||
stack pointer. */
|
||||
adds r12 = -PLTEXIT_FRAME_SIZE, loc10
|
||||
adds r2 = -(PLTEXIT_FRAME_SIZE - 16), loc10
|
||||
adds r3 = -(PLTEXIT_FRAME_SIZE - 24), loc10
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
/* Store all possible return values into the buffer. */
|
||||
st8 [r2] = r8, 16
|
||||
st8 [r3] = r9, 16
|
||||
mov out0 = loc8
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
st8 [r2] = r10, 16
|
||||
st8 [r3] = r11, 24
|
||||
mov out1 = loc9
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f8, 32
|
||||
stf.spill [r3] = f9, 32
|
||||
mov out2 = loc7 /* Pointer to La_ia64_regs */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f10, 32
|
||||
stf.spill [r3] = f11, 32
|
||||
adds out3 = 16, r12 /* Pointer to La_ia64_retval */
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
stf.spill [r2] = f12, 32
|
||||
stf.spill [r3] = f13, 32
|
||||
/* We need to restore gp for _dl_audit_pltexit. */
|
||||
mov gp = loc11
|
||||
;;
|
||||
}
|
||||
{ .mmb
|
||||
stf.spill [r2] = f14
|
||||
stf.spill [r3] = f15
|
||||
br.call.sptk.many b0 = _dl_audit_pltexit
|
||||
}
|
||||
{ .mmi
|
||||
/* Load all the non-floating and floating return values. Skip
|
||||
the 16byte scratch area. */
|
||||
adds r2 = 16, r12
|
||||
adds r3 = 24, r12
|
||||
nop.i 0x0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ld8 r8 = [r2], 16
|
||||
ld8 r9 = [r3], 16
|
||||
nop.i 0x0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ld8 r10 = [r2], 16
|
||||
ld8 r11 = [r3], 24
|
||||
nop.i 0x0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f8 = [r2], 32
|
||||
ldf.fill f9 = [r3], 32
|
||||
mov ar.lc = loc6
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f10 = [r2], 32
|
||||
ldf.fill f11 = [r3], 32
|
||||
mov ar.pfs = loc0
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f12 = [r2], 32
|
||||
ldf.fill f13 = [r3], 32
|
||||
mov b0 = loc1
|
||||
;;
|
||||
}
|
||||
{ .mmi
|
||||
ldf.fill f14 = [r2]
|
||||
ldf.fill f15 = [r3]
|
||||
/* We know that the previous stack pointer, loc10, isn't 0.
|
||||
We use it to reload p7. */
|
||||
cmp.ne p7, p0 = 0, loc10
|
||||
;;
|
||||
}
|
||||
.Lresolved:
|
||||
{ .mmb
|
||||
.restore sp
|
||||
mov r12 = loc10
|
||||
(p7) br.ret.sptk.many b0
|
||||
;;
|
||||
}
|
||||
/* An alloc is needed for the break system call to work. We
|
||||
don't care about the old value of the pfs register. After
|
||||
this alloc, we can't use any rotating registers. Otherwise
|
||||
assembler won't be happy. This has to be at the end. */
|
||||
{ .mmb
|
||||
.prologue
|
||||
.body
|
||||
alloc r2 = ar.pfs, 0, 0, 8, 0
|
||||
br.sptk.many b6
|
||||
;;
|
||||
}
|
||||
END(_dl_runtime_profile)
|
||||
#endif
|
@ -1,20 +0,0 @@
|
||||
/* Determine DT_INIT/DT_FINI support in the dynamic loader. IA64 version.
|
||||
Copyright (C) 2020-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Enable DT_INIT/DT_FINI support. */
|
||||
#define ELF_INITFINI 1
|
@ -1,8 +0,0 @@
|
||||
#include <link.h>
|
||||
#include <dl-fptr.h>
|
||||
|
||||
extern void _start (void);
|
||||
|
||||
/* The function's entry point is stored in the first word of the
|
||||
function descriptor (plabel) of _start(). */
|
||||
#define ENTRY_POINT ELF_PTR_TO_FDESC (_start)->ip
|
@ -1,3 +0,0 @@
|
||||
/* ABI version for _Float128 ABI introduction. */
|
||||
#define FLOAT128_VERSION GLIBC_2.26
|
||||
#define FLOAT128_VERSION_M GLIBC_2_26
|
@ -1,34 +0,0 @@
|
||||
ifeq ($(subdir),math)
|
||||
#
|
||||
# Some files which need to go both into libc and libm have external
|
||||
# dependencies which need to be resolved differently for libc
|
||||
# vs. libm. For example, inside libc, __libm_error_support needs to
|
||||
# resolve to HIDDEN_JUMPTARGET(__libm_error_support) whereas within
|
||||
# libm it always resolves to __libm_error_support. Such files need to
|
||||
# be compiled twice. Fortunately, math/Makefile already has logic to
|
||||
# support this: if a file starts with "s_", make will automatically
|
||||
# generate a matching file whose name starts with "m_" which simply
|
||||
# includes the corresponding "s_" file.
|
||||
#
|
||||
duplicated-routines = s_libm_ldexp s_libm_ldexpf s_libm_ldexpl \
|
||||
s_libm_scalbn s_libm_scalbnf s_libm_scalbnl
|
||||
|
||||
libm-sysdep_routines += s_erfc s_erfcf s_erfcl \
|
||||
s_matherrf s_matherrl libm_reduce \
|
||||
libm_error \
|
||||
libm_frexp libm_frexpf libm_frexpl \
|
||||
libm_sincos libm_sincosf libm_sincosl \
|
||||
libm_sincos_large \
|
||||
libm_lgamma libm_lgammaf libm_lgammal \
|
||||
libm_scalblnf \
|
||||
$(duplicated-routines:s_%=m_%)
|
||||
|
||||
sysdep_routines += libc_libm_error libm_frexp libm_frexpf libm_frexpl \
|
||||
$(duplicated-routines)
|
||||
|
||||
sysdep-CPPFLAGS += -include libm-symbols.h \
|
||||
-D__POSIX__ -Dopensource \
|
||||
-D_LIB_VERSIONIMF=_LIB_VERSION \
|
||||
-DSIZE_INT_32 -DSIZE_LONG_INT_64 -DSIZE_LONG_LONG_INT_64 \
|
||||
-DSIZE_LONG_64 -DIA64
|
||||
endif
|
@ -1,50 +0,0 @@
|
||||
----------------------------------------------------------
|
||||
Notes on how to update libm based on Intel's libm releases
|
||||
----------------------------------------------------------
|
||||
|
||||
The source code in this directory is currently based on Intel libm
|
||||
v2.1 as available from:
|
||||
|
||||
http://www.intel.com/software/products/opensource/libraries/num.htm
|
||||
|
||||
To ease importing, fix some bugs, and simplify integration into libc,
|
||||
it is also necessary to apply the patch at:
|
||||
|
||||
ftp://ftp.hpl.hp.com/pub/linux-ia64/intel-libm-041228.diff.gz
|
||||
|
||||
The expectation is that Intel will integrate most if not all of these
|
||||
changes into future releases of libm, so this patching step can
|
||||
hopefully be omitted in the future.
|
||||
|
||||
Once the patched libm sources are extracted in a directory $LIBM, they
|
||||
can be imported into the libc source tree at $LIBC with the following
|
||||
step:
|
||||
|
||||
$ cd $LIBC/src/sysdep/ia64/fpu
|
||||
$ ./import_intel_libm $LIBM
|
||||
|
||||
This should produce a number of "Importing..." messages, without
|
||||
showing any errors.
|
||||
|
||||
At this point, you should be able to build glibc in the usual fashion.
|
||||
We assume you do this in directory $OBJ. Once the build has
|
||||
completed, run "make check" to verify that all (math) checks succeed.
|
||||
If these checks succeed, you should also run the following commands to
|
||||
verify that the new libm doesn't pollute the name-space and has proper
|
||||
size-info for the data objects:
|
||||
|
||||
$ cd $LIBC/src/sysdep/ia64/fpu
|
||||
$ import_check $OBJ/math/
|
||||
|
||||
There should be no (unexpected) errors reported by this script.
|
||||
|
||||
As an optional step, you may also want to confirm that the new libm
|
||||
exports the exact same global symbols as the old one.
|
||||
|
||||
If you want to see the changes introduced by the "import_intel_libm"
|
||||
script, you can run the commands:
|
||||
|
||||
$ cd $LIBC/src/sysdep/ia64/fpu
|
||||
$ import_diffs
|
||||
|
||||
That's it.
|
@ -1,10 +0,0 @@
|
||||
libc {
|
||||
GLIBC_PRIVATE {
|
||||
__libm_frexp_4; __libm_frexp_4f; __libm_frexp_4l; __libm_error_support;
|
||||
}
|
||||
}
|
||||
libm {
|
||||
GLIBC_2.2.3 {
|
||||
matherrf; matherrl;
|
||||
}
|
||||
}
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,878 +0,0 @@
|
||||
.file "acos.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003 Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/02/00 Initial version
|
||||
// 08/17/00 New and much faster algorithm.
|
||||
// 08/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths,
|
||||
// fixed mfb split issue stalls.
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 08/02/02 New and much faster algorithm II
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
|
||||
// Description
|
||||
//=========================================
|
||||
// The acos function computes the principal value of the arc cosine of x.
|
||||
// acos(0) returns Pi/2, acos(1) returns 0, acos(-1) returns Pi.
|
||||
// A domain error occurs for arguments not in the range [-1,+1].
|
||||
//
|
||||
// The acos function returns the arc cosine in the range [0, Pi] radians.
|
||||
//
|
||||
// There are 8 paths:
|
||||
// 1. x = +/-0.0
|
||||
// Return acos(x) = Pi/2 + x
|
||||
//
|
||||
// 2. 0.0 < |x| < 0.625
|
||||
// Return acos(x) = Pi/2 - x - x^3 *PolA(x^2)
|
||||
// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32
|
||||
//
|
||||
// 3. 0.625 <=|x| < 1.0
|
||||
// Return acos(x) = Pi/2 - asin(x) =
|
||||
// = Pi/2 - sign(x) * ( Pi/2 - sqrt(R) * PolB(R))
|
||||
// Where R = 1 - |x|,
|
||||
// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12
|
||||
//
|
||||
// sqrt(R) is approximated using the following sequence:
|
||||
// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,
|
||||
// |eps| < 2^(-8)
|
||||
// Then 3 iterations are used to refine the result:
|
||||
// H0 = 0.5*y0
|
||||
// S0 = R*y0
|
||||
//
|
||||
// d0 = 0.5 - H0*S0
|
||||
// H1 = H0 + d0*H0
|
||||
// S1 = S0 + d0*S0
|
||||
//
|
||||
// d1 = 0.5 - H1*S1
|
||||
// H2 = H1 + d1*H1
|
||||
// S2 = S1 + d1*S1
|
||||
//
|
||||
// d2 = 0.5 - H2*S2
|
||||
// S3 = S2 + d2*S2
|
||||
//
|
||||
// S3 approximates sqrt(R) with enough accuracy for this algorithm
|
||||
//
|
||||
// So, the result should be reconstructed as follows:
|
||||
// acos(x) = Pi/2 - sign(x) * (Pi/2 - S3*PolB(R))
|
||||
//
|
||||
// But for optimization purposes the reconstruction step is slightly
|
||||
// changed:
|
||||
// acos(x) = Cpi + sign(x)*PolB(R)*S2 - sign(x)*d2*S2*PolB(R)
|
||||
// where Cpi = 0 if x > 0 and Cpi = Pi if x < 0
|
||||
//
|
||||
// 4. |x| = 1.0
|
||||
// Return acos(1.0) = 0.0, acos(-1.0) = Pi
|
||||
//
|
||||
// 5. 1.0 < |x| <= +INF
|
||||
// A domain error occurs for arguments not in the range [-1,+1]
|
||||
//
|
||||
// 6. x = [S,Q]NaN
|
||||
// Return acos(x) = QNaN
|
||||
//
|
||||
// 7. x is denormal
|
||||
// Return acos(x) = Pi/2 - x,
|
||||
//
|
||||
// 8. x is unnormal
|
||||
// Normalize input in f8 and return to the very beginning of the function
|
||||
//
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// Floating Point registers used:
|
||||
// f8, input, output
|
||||
// f6, f7, f9 -> f15, f32 -> f64
|
||||
|
||||
// General registers used:
|
||||
// r3, r21 -> r31, r32 -> r38
|
||||
|
||||
// Predicate registers used:
|
||||
// p0, p6 -> p14
|
||||
|
||||
//
|
||||
// Assembly macros
|
||||
//=========================================
|
||||
// integer registers used
|
||||
// scratch
|
||||
rTblAddr = r3
|
||||
|
||||
rPiBy2Ptr = r21
|
||||
rTmpPtr3 = r22
|
||||
rDenoBound = r23
|
||||
rOne = r24
|
||||
rAbsXBits = r25
|
||||
rHalf = r26
|
||||
r0625 = r27
|
||||
rSign = r28
|
||||
rXBits = r29
|
||||
rTmpPtr2 = r30
|
||||
rTmpPtr1 = r31
|
||||
|
||||
// stacked
|
||||
GR_SAVE_PFS = r32
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_GP = r34
|
||||
GR_Parameter_X = r35
|
||||
GR_Parameter_Y = r36
|
||||
GR_Parameter_RESULT = r37
|
||||
GR_Parameter_TAG = r38
|
||||
|
||||
// floating point registers used
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
// scratch
|
||||
fXSqr = f6
|
||||
fXCube = f7
|
||||
fXQuadr = f9
|
||||
f1pX = f10
|
||||
f1mX = f11
|
||||
f1pXRcp = f12
|
||||
f1mXRcp = f13
|
||||
fH = f14
|
||||
fS = f15
|
||||
// stacked
|
||||
fA3 = f32
|
||||
fB1 = f32
|
||||
fA5 = f33
|
||||
fB2 = f33
|
||||
fA7 = f34
|
||||
fPiBy2 = f34
|
||||
fA9 = f35
|
||||
fA11 = f36
|
||||
fB10 = f35
|
||||
fB11 = f36
|
||||
fA13 = f37
|
||||
fA15 = f38
|
||||
fB4 = f37
|
||||
fB5 = f38
|
||||
fA17 = f39
|
||||
fA19 = f40
|
||||
fB6 = f39
|
||||
fB7 = f40
|
||||
fA21 = f41
|
||||
fA23 = f42
|
||||
fB3 = f41
|
||||
fB8 = f42
|
||||
fA25 = f43
|
||||
fA27 = f44
|
||||
fB9 = f43
|
||||
fB12 = f44
|
||||
fA29 = f45
|
||||
fA31 = f46
|
||||
fA33 = f47
|
||||
fA35 = f48
|
||||
fBaseP = f49
|
||||
fB0 = f50
|
||||
fSignedS = f51
|
||||
fD = f52
|
||||
fHalf = f53
|
||||
fR = f54
|
||||
fCloseTo1Pol = f55
|
||||
fSignX = f56
|
||||
fDenoBound = f57
|
||||
fNormX = f58
|
||||
fX8 = f59
|
||||
fRSqr = f60
|
||||
fRQuadr = f61
|
||||
fR8 = f62
|
||||
fX16 = f63
|
||||
fCpi = f64
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
RODATA
|
||||
.align 16
|
||||
LOCAL_OBJECT_START(acos_base_range_table)
|
||||
// Ai: Polynomial coefficients for the acos(x), |x| < .625000
|
||||
// Bi: Polynomial coefficients for the acos(x), |x| > .625000
|
||||
data8 0xBFDAAB56C01AE468 //A29
|
||||
data8 0x3FE1C470B76A5B2B //A31
|
||||
data8 0xBFDC5FF82A0C4205 //A33
|
||||
data8 0x3FC71FD88BFE93F0 //A35
|
||||
data8 0xB504F333F9DE6487, 0x00003FFF //B0
|
||||
data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3
|
||||
data8 0x3F9F1C71BC4A7823 //A9
|
||||
data8 0x3F96E8BBAAB216B2 //A11
|
||||
data8 0x3F91C4CA1F9F8A98 //A13
|
||||
data8 0x3F8C9DDCEDEBE7A6 //A15
|
||||
data8 0x3F877784442B1516 //A17
|
||||
data8 0x3F859C0491802BA2 //A19
|
||||
data8 0x9999999998C88B8F, 0x00003FFB //A5
|
||||
data8 0x3F6BD7A9A660BF5E //A21
|
||||
data8 0x3F9FC1659340419D //A23
|
||||
data8 0xB6DB6DB798149BDF, 0x00003FFA //A7
|
||||
data8 0xBFB3EF18964D3ED3 //A25
|
||||
data8 0x3FCD285315542CF2 //A27
|
||||
data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1
|
||||
data8 0x3EF0DDA376D10FB3 //B10
|
||||
data8 0xBEB83CAFE05EBAC9 //B11
|
||||
data8 0x3F65FFB67B513644 //B4
|
||||
data8 0x3F5032FBB86A4501 //B5
|
||||
data8 0x3F392162276C7CBA //B6
|
||||
data8 0x3F2435949FD98BDF //B7
|
||||
data8 0xD93923D7FA08341C, 0x00003FF9 //B2
|
||||
data8 0x3F802995B6D90BDB //B3
|
||||
data8 0x3F10DF86B341A63F //B8
|
||||
data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2
|
||||
data8 0x3EFA3EBD6B0ECB9D //B9
|
||||
data8 0x3EDE18BA080E9098 //B12
|
||||
LOCAL_OBJECT_END(acos_base_range_table)
|
||||
|
||||
.section .text
|
||||
GLOBAL_LIBM_ENTRY(acos)
|
||||
acos_unnormal_back:
|
||||
{ .mfi
|
||||
getf.d rXBits = f8 // grab bits of input value
|
||||
// set p12 = 1 if x is a NaN, denormal, or zero
|
||||
fclass.m p12, p0 = f8, 0xcf
|
||||
adds rSign = 1, r0
|
||||
}
|
||||
{ .mfi
|
||||
addl rTblAddr = @ltoff(acos_base_range_table),gp
|
||||
// 1 - x = 1 - |x| for positive x
|
||||
fms.s1 f1mX = f1, f1, f8
|
||||
addl rHalf = 0xFFFE, r0 // exponent of 1/2
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625
|
||||
// set p8 = 1 if x < 0
|
||||
fcmp.lt.s1 p8, p9 = f8, f0
|
||||
shl rSign = rSign, 63 // sign bit
|
||||
}
|
||||
{ .mfi
|
||||
// point to the beginning of the table
|
||||
ld8 rTblAddr = [rTblAddr]
|
||||
// 1 + x = 1 - |x| for negative x
|
||||
fma.s1 f1pX = f1, f1, f8
|
||||
adds rOne = 0x3FF, r0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
andcm rAbsXBits = rXBits, rSign // bits of |x|
|
||||
fmerge.s fSignX = f8, f1 // signum(x)
|
||||
shl r0625 = r0625, 48 // bits of DP representation of 0.625
|
||||
}
|
||||
{ .mfb
|
||||
setf.exp fHalf = rHalf // load A2 to FP reg
|
||||
fma.s1 fXSqr = f8, f8, f0 // x^2
|
||||
// branch on special path if x is a NaN, denormal, or zero
|
||||
(p12) br.cond.spnt acos_special
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
adds rPiBy2Ptr = 272, rTblAddr
|
||||
nop.f 0
|
||||
shl rOne = rOne, 52 // bits of 1.0
|
||||
}
|
||||
{ .mfi
|
||||
adds rTmpPtr1 = 16, rTblAddr
|
||||
nop.f 0
|
||||
// set p6 = 1 if |x| < 0.625
|
||||
cmp.lt p6, p7 = rAbsXBits, r0625
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfpd fA29, fA31 = [rTblAddr] // A29, fA31
|
||||
// 1 - x = 1 - |x| for positive x
|
||||
(p9) fms.s1 fR = f1, f1, f8
|
||||
// point to coefficient of "near 1" polynomial
|
||||
(p7) adds rTmpPtr2 = 176, rTblAddr
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35
|
||||
// 1 + x = 1 - |x| for negative x
|
||||
(p8) fma.s1 fR = f1, f1, f8
|
||||
(p6) adds rTmpPtr2 = 48, rTblAddr
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fB0 = [rTmpPtr1], 16 // B0
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mib
|
||||
adds rTmpPtr3 = 16, rTmpPtr2
|
||||
// set p10 = 1 if |x| = 1.0
|
||||
cmp.eq p10, p0 = rAbsXBits, rOne
|
||||
// branch on special path for |x| = 1.0
|
||||
(p10) br.cond.spnt acos_abs_1
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fA3 = [rTmpPtr2], 48 // A3 or B1
|
||||
nop.f 0
|
||||
adds rTmpPtr1 = 64, rTmpPtr3
|
||||
}
|
||||
{ .mib
|
||||
ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11
|
||||
// set p11 = 1 if |x| > 1.0
|
||||
cmp.gt p11, p0 = rAbsXBits, rOne
|
||||
// branch on special path for |x| > 1.0
|
||||
(p11) br.cond.spnt acos_abs_gt_1
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7
|
||||
// initial approximation of 1 / sqrt(1 - x)
|
||||
frsqrta.s1 f1mXRcp, p0 = f1mX
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5
|
||||
fma.s1 fXCube = fXSqr, f8, f0 // x^3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fA5 = [rTmpPtr2], 48 // A5 or B2
|
||||
// initial approximation of 1 / sqrt(1 + x)
|
||||
frsqrta.s1 f1pXRcp, p0 = f1pX
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8
|
||||
fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fA7 = [rTmpPtr1] // A7 or Pi/2
|
||||
fma.s1 fRSqr = fR, fR, f0 // R^2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt acos_base_range;
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB11 = fB11, fR, fB10
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB1 = fB1, fR, fB0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB5 = fB5, fR, fB4
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB7 = fB7, fR, fB6
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB3 = fB3, fR, fB2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB9 = fB9, fR, fB8
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fB12 = fB12, fRSqr, fB11
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fB7 = fB7, fRSqr, fB5
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fB3 = fB3, fRSqr, fB1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
(p9) fma.s1 fCpi = f1, f0, f0 // Cpi = 0 if x > 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fma.s1 fCpi = fPiBy2, f1, fPiBy2 // Cpi = Pi if x < 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB12 = fB12, fRSqr, fB9
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB7 = fB7, fRQuadr, fB3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fCloseTo1Pol = fB12, fR8, fB7
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// -signum(x)* S2 = -signum(x)*(S1 + S1*d1)
|
||||
fma.s1 fSignedS = fSignedS, fD, fSignedS
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Cpi + signum(x)*PolB*S2
|
||||
fnma.s1 fCpi = fSignedS, fCloseTo1Pol, fCpi
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// signum(x)*PolB * S2
|
||||
fnma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result for 0.625 <= |x| < 1
|
||||
fma.d.s0 f8 = fCloseTo1Pol, fD, fCpi
|
||||
// exit here for 0.625 <= |x| < 1
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// here if |x| < 0.625
|
||||
.align 32
|
||||
acos_base_range:
|
||||
{ .mfi
|
||||
ldfe fCpi = [rPiBy2Ptr] // Pi/2
|
||||
fma.s1 fA33 = fA33, fXSqr, fA31
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA15 = fA15, fXSqr, fA13
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA29 = fA29, fXSqr, fA27
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA25 = fA25, fXSqr, fA23
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA21 = fA21, fXSqr, fA19
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA9 = fA9, fXSqr, fA7
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA5 = fA5, fXSqr, fA3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA35 = fA35, fXQuadr, fA33
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA17 = fA17, fXQuadr, fA15
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA25 = fA25, fXQuadr, fA21
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA9 = fA9, fXQuadr, fA5
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fms.s1 fCpi = fCpi, f1, f8 // Pi/2 - x
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA35 = fA35, fXQuadr, fA29
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA17 = fA17, fXSqr, fA11
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fX16 = fX8, fX8, f0 // x^16
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA35 = fA35, fX8, fA25
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA17 = fA17, fX8, fA9
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fBaseP = fA35, fX16, fA17
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result for |x| < 0.625
|
||||
fnma.d.s0 f8 = fBaseP, fXCube, fCpi
|
||||
// exit here for |x| < 0.625 path
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// here if |x| = 1
|
||||
// acos(1) = 0
|
||||
// acos(-1) = Pi
|
||||
.align 32
|
||||
acos_abs_1:
|
||||
{ .mfi
|
||||
ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
.pred.rel "mutex", p8, p9
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// result for x = 1.0
|
||||
(p9) fma.d.s0 f8 = f1, f0, f0 // 0.0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result for x = -1.0
|
||||
(p8) fma.d.s0 f8 = fPiBy2, f1, fPiBy2 // Pi
|
||||
// exit here for |x| = 1.0
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// here if x is a NaN, denormal, or zero
|
||||
.align 32
|
||||
acos_special:
|
||||
{ .mfi
|
||||
// point to Pi/2
|
||||
adds rPiBy2Ptr = 272, rTblAddr
|
||||
// set p12 = 1 if x is a NaN
|
||||
fclass.m p12, p0 = f8, 0xc3
|
||||
nop.i 0
|
||||
}
|
||||
{ .mlx
|
||||
nop.m 0
|
||||
// smallest positive DP normalized number
|
||||
movl rDenoBound = 0x0010000000000000
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
|
||||
// set p13 = 1 if x = 0.0
|
||||
fclass.m p13, p0 = f8, 0x07
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnorm.s1 fNormX = f8
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
// load smallest normal to FP reg
|
||||
setf.d fDenoBound = rDenoBound
|
||||
// answer if x is a NaN
|
||||
(p12) fma.d.s0 f8 = f8,f1,f0
|
||||
// exit here if x is a NaN
|
||||
(p12) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// absolute value of normalized x
|
||||
fmerge.s fNormX = f1, fNormX
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result for x = 0
|
||||
(p13) fma.d.s0 f8 = fPiBy2, f1, f8
|
||||
// exit here if x = 0.0
|
||||
(p13) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
// if we are still here then x is denormal or unnormal
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// set p14 = 1 if normalized x is greater than or
|
||||
// equal to the smallest normalized value
|
||||
// So, if p14 is set to 1 it means that we deal with
|
||||
// unnormal rather than with "true" denormal
|
||||
fcmp.ge.s1 p14, p0 = fNormX, fDenoBound
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// normalize unnormal input
|
||||
(p14) fnorm.s1 f8 = f8
|
||||
// return to the main path
|
||||
(p14) br.cond.sptk acos_unnormal_back
|
||||
}
|
||||
;;
|
||||
// if we are still here it means that input is "true" denormal
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result if x is denormal
|
||||
fms.d.s0 f8 = fPiBy2, f1, f8 // Pi/2 - x
|
||||
// exit here if x is denormal
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// here if |x| > 1.0
|
||||
// error handler should be called
|
||||
.align 32
|
||||
acos_abs_gt_1:
|
||||
{ .mfi
|
||||
alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
|
||||
fmerge.s FR_X = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 58 // error code
|
||||
frcpa.s0 FR_RESULT, p0 = f0,f0
|
||||
// call error handler routine
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
GLOBAL_LIBM_END(acos)
|
||||
libm_alias_double_other (acos, acos)
|
||||
|
||||
|
||||
|
||||
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,694 +0,0 @@
|
||||
.file "acosf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/02/00 Initial version
|
||||
// 06/28/00 Improved speed
|
||||
// 06/31/00 Changed register allocation because of some duplicate macros
|
||||
// moved nan exit bundle up to gain a cycle.
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 08/17/00 Changed predicate register macro-usage to direct predicate
|
||||
// names due to an assembler bug.
|
||||
// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal.
|
||||
// 03/13/01 Corrected sign of imm1 value in dep instruction.
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
// 04/17/03 Moved mutex after label
|
||||
|
||||
|
||||
// Description
|
||||
//=========================================
|
||||
// The acosf function computes the principal value of the arc cosine of x.
|
||||
// A domain error occurs for arguments not in the range [-1,+1].
|
||||
|
||||
// The acosf function returns the arc cosine in the range [0, +pi] radians.
|
||||
// acos(1) returns +0
|
||||
// acos(x) returns a NaN and raises the invalid exception for |x| >1
|
||||
|
||||
// |x| <= sqrt(2)/2. get Ax and Bx
|
||||
|
||||
// poly_p1 = x p1
|
||||
// poly_p3 = x2 p4 + p3
|
||||
// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x
|
||||
// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2
|
||||
|
||||
// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x
|
||||
// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
|
||||
|
||||
// poly_p7 = x2 p8 + p7
|
||||
// poly_p5 = x2 p6 + p5
|
||||
|
||||
// poly_p7 = x4 p9 + (x2 p8 + p7)
|
||||
// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5
|
||||
|
||||
// sinf1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
|
||||
// = x19 p9 + x17 p8 + x15 p7 + x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x
|
||||
// answer1 = pi/2 - sinf1
|
||||
|
||||
|
||||
|
||||
// |x| > sqrt(2)/2
|
||||
|
||||
// Get z = sqrt(1-x2)
|
||||
|
||||
// Get polynomial in t = 1-x2
|
||||
|
||||
// t2 = t t
|
||||
// t4 = t2 t2
|
||||
|
||||
// poly_p4 = t p5 + p4
|
||||
// poly_p1 = t p1 + 1
|
||||
|
||||
// poly_p6 = t p7 + p6
|
||||
// poly_p2 = t p3 + p2
|
||||
|
||||
// poly_p8 = t p9 + p8
|
||||
|
||||
// poly_p4 = t2 poly_p6 + poly_p4
|
||||
// = t2 (t p7 + p6) + (t p5 + p4)
|
||||
|
||||
// poly_p2 = t2 poly_p2 + poly_p1
|
||||
// = t2 (t p3 + p2) + (t p1 + 1)
|
||||
|
||||
// poly_p4 = t4 poly_p8 + poly_p4
|
||||
// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))
|
||||
|
||||
// P(t) = poly_p2 + t4 poly_p4
|
||||
// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)))
|
||||
// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4
|
||||
|
||||
|
||||
// answer2 = sign(x) z P(t) if x>0
|
||||
// = sign(x) z P(t) + pi if x<0
|
||||
|
||||
|
||||
//
|
||||
// Assembly macros
|
||||
//=========================================
|
||||
|
||||
// predicate registers
|
||||
//acosf_pred_LEsqrt2by2 = p7
|
||||
//acosf_pred_GTsqrt2by2 = p8
|
||||
|
||||
// integer registers
|
||||
ACOSF_Addr1 = r33
|
||||
ACOSF_Addr2 = r34
|
||||
ACOSF_GR_1by2 = r35
|
||||
|
||||
ACOSF_GR_3by2 = r36
|
||||
ACOSF_GR_5by2 = r37
|
||||
|
||||
GR_SAVE_B0 = r38
|
||||
GR_SAVE_PFS = r39
|
||||
GR_SAVE_GP = r40
|
||||
|
||||
GR_Parameter_X = r41
|
||||
GR_Parameter_Y = r42
|
||||
GR_Parameter_RESULT = r43
|
||||
GR_Parameter_TAG = r44
|
||||
|
||||
// floating point registers
|
||||
|
||||
acosf_y = f32
|
||||
acosf_abs_x = f33
|
||||
acosf_x2 = f34 // NOTE(review): acosf_x2 is assigned again (f48) further down; this earlier value looks superseded - confirm
|
||||
acosf_sgn_x = f35
|
||||
|
||||
acosf_1by2 = f36
|
||||
acosf_3by2 = f37
|
||||
acosf_5by2 = f38
|
||||
acosf_coeff_P3 = f39
|
||||
acosf_coeff_P8 = f40
|
||||
|
||||
acosf_coeff_P1 = f41
|
||||
acosf_coeff_P4 = f42
|
||||
acosf_coeff_P5 = f43
|
||||
acosf_coeff_P2 = f44
|
||||
acosf_coeff_P7 = f45
|
||||
|
||||
acosf_coeff_P6 = f46
|
||||
acosf_coeff_P9 = f47
|
||||
acosf_x2 = f48
|
||||
acosf_x3 = f49
|
||||
acosf_x4 = f50
|
||||
|
||||
acosf_x8 = f51
|
||||
acosf_x5 = f52
|
||||
acosf_const_piby2 = f53
|
||||
acosf_const_sqrt2by2 = f54
|
||||
acosf_x11 = f55
|
||||
|
||||
acosf_poly_p1 = f56
|
||||
acosf_poly_p3 = f57
|
||||
acosf_sinf1 = f58
|
||||
acosf_poly_p2 = f59
|
||||
acosf_poly_Ax = f60
|
||||
|
||||
acosf_poly_p7 = f61
|
||||
acosf_poly_p5 = f62
|
||||
acosf_sgnx_t4 = f63
|
||||
acosf_poly_Bx = f64
|
||||
acosf_t = f65
|
||||
|
||||
acosf_yby2 = f66
|
||||
acosf_B = f67
|
||||
acosf_B2 = f68
|
||||
acosf_Az = f69
|
||||
acosf_dz = f70
|
||||
|
||||
acosf_Sz = f71
|
||||
acosf_d2z = f72
|
||||
acosf_Fz = f73
|
||||
acosf_z = f74
|
||||
acosf_sgnx_z = f75
|
||||
|
||||
acosf_t2 = f76
|
||||
acosf_2poly_p4 = f77
|
||||
acosf_2poly_p6 = f78
|
||||
acosf_2poly_p1 = f79
|
||||
acosf_2poly_p2 = f80
|
||||
|
||||
acosf_2poly_p8 = f81
|
||||
acosf_t4 = f82
|
||||
acosf_Pt = f83
|
||||
acosf_sgnx_2poly_p2 = f84
|
||||
acosf_sgn_x_piby2 = f85
|
||||
|
||||
acosf_poly_p7a = f86
|
||||
acosf_2poly_p4a = f87
|
||||
acosf_2poly_p4b = f88
|
||||
acosf_2poly_p2a = f89
|
||||
acosf_poly_p1a = f90
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(acosf_coeff_1_table)
|
||||
data8 0x3FC5555607DCF816 // P1
|
||||
data8 0x3F9CF81AD9BAB2C6 // P4
|
||||
data8 0x3FC59E0975074DF3 // P7
|
||||
data8 0xBFA6F4CC2780AA1D // P6
|
||||
data8 0x3FC2DD45292E93CB // P9
|
||||
data8 0x3fe6a09e667f3bcd // sqrt(2)/2
|
||||
LOCAL_OBJECT_END(acosf_coeff_1_table)
|
||||
|
||||
LOCAL_OBJECT_START(acosf_coeff_2_table)
|
||||
data8 0x3FA6F108E31EFBA6 // P3
|
||||
data8 0xBFCA31BF175D82A0 // P8
|
||||
data8 0x3FA30C0337F6418B // P5
|
||||
data8 0x3FB332C9266CB1F9 // P2
|
||||
data8 0x3ff921fb54442d18 // pi_by_2
|
||||
LOCAL_OBJECT_END(acosf_coeff_2_table)
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_LIBM_ENTRY(acosf)
|
||||
|
||||
// Load the addresses of the two tables.
|
||||
// Then, load the coefficients and other constants.
|
||||
|
||||
{ .mfi
|
||||
alloc r32 = ar.pfs,1,8,4,0
|
||||
fnma.s1 acosf_t = f8,f8,f1
|
||||
dep.z ACOSF_GR_1by2 = 0x3f,24,8 // 0x3f000000
|
||||
}
|
||||
{ .mfi
|
||||
addl ACOSF_Addr1 = @ltoff(acosf_coeff_1_table),gp
|
||||
fma.s1 acosf_x2 = f8,f8,f0
|
||||
addl ACOSF_Addr2 = @ltoff(acosf_coeff_2_table),gp ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ld8 ACOSF_Addr1 = [ACOSF_Addr1]
|
||||
fmerge.s acosf_abs_x = f1,f8
|
||||
dep ACOSF_GR_3by2 = -1,r0,22,8 // 0x3fc00000
|
||||
}
|
||||
{ .mlx
|
||||
nop.m 999
|
||||
movl ACOSF_GR_5by2 = 0x40200000;;
|
||||
}
|
||||
|
||||
|
||||
|
||||
{ .mfi
|
||||
setf.s acosf_1by2 = ACOSF_GR_1by2
|
||||
fmerge.s acosf_sgn_x = f8,f1
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ld8 ACOSF_Addr2 = [ACOSF_Addr2]
|
||||
nop.f 0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
setf.s acosf_5by2 = ACOSF_GR_5by2
|
||||
fcmp.lt.s1 p11,p12 = f8,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mmf
|
||||
ldfpd acosf_coeff_P1,acosf_coeff_P4 = [ACOSF_Addr1],16
|
||||
setf.s acosf_3by2 = ACOSF_GR_3by2
|
||||
fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfpd acosf_coeff_P7,acosf_coeff_P6 = [ACOSF_Addr1],16
|
||||
fma.s1 acosf_t2 = acosf_t,acosf_t,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd acosf_coeff_P3,acosf_coeff_P8 = [ACOSF_Addr2],16
|
||||
fma.s1 acosf_x4 = acosf_x2,acosf_x2,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfpd acosf_coeff_P9,acosf_const_sqrt2by2 = [ACOSF_Addr1]
|
||||
fclass.m.unc p10,p0 = f8, 0x07 //@zero
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd acosf_coeff_P5,acosf_coeff_P2 = [ACOSF_Addr2],16
|
||||
fma.s1 acosf_x3 = f8,acosf_x2,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfd acosf_const_piby2 = [ACOSF_Addr2]
|
||||
frsqrta.s1 acosf_B,p0 = acosf_t
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p8) fma.s.s0 f8 = f8,f1,f0
|
||||
(p8) br.ret.spnt b0 ;; // Exit if x=nan
|
||||
}
|
||||
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fcmp.eq.s1 p6,p0 = acosf_abs_x,f1
|
||||
(p10) br.cond.spnt ACOSF_ZERO ;; // Branch if x=0
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.gt.s1 p9,p0 = acosf_abs_x,f1
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_x8 = acosf_x4,acosf_x4,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0
|
||||
(p6) br.cond.spnt ACOSF_ABS_ONE ;; // Branch if |x|=1
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_x5 = acosf_x2,acosf_x3,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
(p9) mov GR_Parameter_TAG = 59
|
||||
fma.s1 acosf_yby2 = acosf_t,acosf_1by2,f0
|
||||
(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_Az = acosf_t,acosf_B,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_B2 = acosf_B,acosf_B,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p1 = f8,acosf_coeff_P1,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_2poly_p1 = acosf_coeff_P1,acosf_t,f1
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p3 = acosf_coeff_P4,acosf_x2,acosf_coeff_P3
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_2poly_p6 = acosf_coeff_P7,acosf_t,acosf_coeff_P6
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p7 = acosf_x2,acosf_coeff_P8,acosf_coeff_P7
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_2poly_p2 = acosf_coeff_P3,acosf_t,acosf_coeff_P2
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p5 = acosf_x2,acosf_coeff_P6,acosf_coeff_P5
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_2poly_p4 = acosf_coeff_P5,acosf_t,acosf_coeff_P4
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_x11 = acosf_x8,acosf_x3,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fnma.s1 acosf_dz = acosf_B2,acosf_yby2,acosf_1by2
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p1a = acosf_x2,acosf_poly_p1,f8
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_2poly_p8 = acosf_coeff_P9,acosf_t,acosf_coeff_P8
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
// Get the absolute value of x and determine the region in which x lies
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.le.s1 p7,p8 = acosf_abs_x,acosf_const_sqrt2by2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p2 = acosf_x2,acosf_poly_p3,acosf_coeff_P2
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_poly_p7a = acosf_x4,acosf_coeff_P9,acosf_poly_p7
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 acosf_2poly_p2a = acosf_2poly_p2,acosf_t2,acosf_2poly_p1
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_sgnx_t4 = acosf_sgn_x,acosf_t4,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_2poly_p4a = acosf_2poly_p6,acosf_t2,acosf_2poly_p4
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_Sz = acosf_5by2,acosf_dz,acosf_3by2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_d2z = acosf_dz,acosf_dz,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fnma.d.s1 acosf_sgn_x_piby2 = acosf_sgn_x,acosf_const_piby2,acosf_const_piby2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 acosf_poly_Ax = acosf_x5,acosf_poly_p2,acosf_poly_p1a
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 acosf_poly_Bx = acosf_x4,acosf_poly_p7a,acosf_poly_p5
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_sgnx_2poly_p2 = acosf_sgn_x,acosf_2poly_p2a,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_2poly_p4b = acosf_2poly_p8,acosf_t4,acosf_2poly_p4a
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 acosf_Fz = acosf_d2z,acosf_Sz,acosf_dz
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.d.s1 acosf_Pt = acosf_2poly_p4b,acosf_sgnx_t4,acosf_sgnx_2poly_p2
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.d.s1 acosf_z = acosf_Az,acosf_Fz,acosf_Az
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.d.s1 acosf_sinf1 = acosf_x11,acosf_poly_Bx,acosf_poly_Ax
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s.s0 f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2
|
||||
nop.i 999
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p7) fms.s.s0 f8 = acosf_const_piby2,f1,acosf_sinf1
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
ACOSF_ZERO:
|
||||
// Here if x=0
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fma.s.s0 f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
|
||||
ACOSF_ABS_ONE:
|
||||
.pred.rel "mutex",p11,p12
|
||||
// Here if |x|=1
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p11) fma.s.s0 f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p12) fma.s.s0 f8 = f1,f0,f0 // acosf(1)=0
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
GLOBAL_LIBM_END(acosf)
|
||||
libm_alias_float_other (acos, acos)
|
||||
|
||||
|
||||
// Stack operations when calling error support.
|
||||
// (1) (2)
|
||||
// sp -> + psp -> +
|
||||
// | |
|
||||
// | | <- GR_Y
|
||||
// | |
|
||||
// | <-GR_Y Y2->|
|
||||
// | |
|
||||
// | | <- GR_X
|
||||
// | |
|
||||
// sp-64 -> + sp -> +
|
||||
// save ar.pfs save b0
|
||||
// save gp
|
||||
|
||||
|
||||
// Stack operations when calling error support.
|
||||
// (3) (call) (4)
|
||||
// psp -> + sp -> +
|
||||
// | |
|
||||
// R3 ->| <- GR_RESULT | -> f8
|
||||
// | |
|
||||
// Y2 ->| <- GR_Y |
|
||||
// | |
|
||||
// X1 ->| |
|
||||
// | |
|
||||
// sp -> + +
|
||||
// restore gp
|
||||
// restore ar.pfs
|
||||
|
||||
|
||||
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 999
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
|
||||
.body
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
frcpa.s0 f9,p0 = f0,f0
|
||||
nop.i 0
|
||||
};;
|
||||
|
||||
{ .mib
|
||||
stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,854 +0,0 @@
|
||||
.file "asin.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003 Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/02/00 Initial version
|
||||
// 08/17/00 New and much faster algorithm.
|
||||
// 08/31/00 Avoided bank conflicts on loads, shortened |x|=1 path,
|
||||
// fixed mfb split issue stalls.
|
||||
// 12/19/00 Fixed small arg cases to force inexact, or inexact and underflow.
|
||||
// 08/02/02 New and much faster algorithm II
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
|
||||
// Description
|
||||
//=========================================
|
||||
// The asin function computes the principal value of the arc sine of x.
|
||||
// asin(0) returns 0, asin(1) returns pi/2, asin(-1) returns -pi/2.
|
||||
// A domain error occurs for arguments not in the range [-1,+1].
|
||||
//
|
||||
// The asin function returns the arc sine in the range [-pi/2, +pi/2] radians.
|
||||
//
|
||||
// There are 8 paths:
|
||||
// 1. x = +/-0.0
|
||||
// Return asin(x) = +/-0.0
|
||||
//
|
||||
// 2. 0.0 < |x| < 0.625
|
||||
// Return asin(x) = x + x^3 *PolA(x^2)
|
||||
// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32
|
||||
//
|
||||
// 3. 0.625 <=|x| < 1.0
|
||||
// Return asin(x) = sign(x) * ( Pi/2 - sqrt(R) * PolB(R))
|
||||
// Where R = 1 - |x|,
|
||||
// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12
|
||||
//
|
||||
// sqrt(R) is approximated using the following sequence:
|
||||
// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,
|
||||
// |eps| < 2^(-8)
|
||||
// Then 3 iterations are used to refine the result:
|
||||
// H0 = 0.5*y0
|
||||
// S0 = R*y0
|
||||
//
|
||||
// d0 = 0.5 - H0*S0
|
||||
// H1 = H0 + d0*H0
|
||||
// S1 = S0 + d0*S0
|
||||
//
|
||||
// d1 = 0.5 - H1*S1
|
||||
// H2 = H1 + d1*H1
|
||||
// S2 = S1 + d1*S1
|
||||
//
|
||||
// d2 = 0.5 - H2*S2
|
||||
// S3 = S2 + d2*S2
|
||||
//
|
||||
// S3 approximates sqrt(R) with enough accuracy for this algorithm
|
||||
//
|
||||
// So, the result should be reconstructed as follows:
|
||||
// asin(x) = sign(x) * (Pi/2 - S3*PolB(R))
|
||||
//
|
||||
// But for optimization purposes the reconstruction step is slightly
|
||||
// changed:
|
||||
// asin(x) = sign(x)*(Pi/2 - PolB(R)*S2) - sign(x)*d2*S2*PolB(R)
|
||||
//
|
||||
// 4. |x| = 1.0
|
||||
// Return asin(x) = sign(x)*Pi/2
|
||||
//
|
||||
// 5. 1.0 < |x| <= +INF
|
||||
// A domain error occurs for arguments not in the range [-1,+1]
|
||||
//
|
||||
// 6. x = [S,Q]NaN
|
||||
// Return asin(x) = QNaN
|
||||
//
|
||||
// 7. x is denormal
|
||||
// Return asin(x) = x + x^3,
|
||||
//
|
||||
// 8. x is unnormal
|
||||
// Normalize input in f8 and return to the very beginning of the function
|
||||
//
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// Floating Point registers used:
|
||||
// f8, input, output
|
||||
// f6, f7, f9 -> f15, f32 -> f63
|
||||
|
||||
// General registers used:
|
||||
// r3, r21 -> r31, r32 -> r38
|
||||
|
||||
// Predicate registers used:
|
||||
// p0, p6 -> p14
|
||||
|
||||
//
|
||||
// Assembly macros
|
||||
//=========================================
|
||||
// integer registers used
|
||||
// scratch
|
||||
rTblAddr = r3
|
||||
|
||||
rPiBy2Ptr = r21
|
||||
rTmpPtr3 = r22
|
||||
rDenoBound = r23
|
||||
rOne = r24
|
||||
rAbsXBits = r25
|
||||
rHalf = r26
|
||||
r0625 = r27
|
||||
rSign = r28
|
||||
rXBits = r29
|
||||
rTmpPtr2 = r30
|
||||
rTmpPtr1 = r31
|
||||
|
||||
// stacked
|
||||
GR_SAVE_PFS = r32
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_GP = r34
|
||||
GR_Parameter_X = r35
|
||||
GR_Parameter_Y = r36
|
||||
GR_Parameter_RESULT = r37
|
||||
GR_Parameter_TAG = r38
|
||||
|
||||
// floating point registers used
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
// scratch
|
||||
fXSqr = f6
|
||||
fXCube = f7
|
||||
fXQuadr = f9
|
||||
f1pX = f10
|
||||
f1mX = f11
|
||||
f1pXRcp = f12
|
||||
f1mXRcp = f13
|
||||
fH = f14
|
||||
fS = f15
|
||||
// stacked
|
||||
fA3 = f32
|
||||
fB1 = f32
|
||||
fA5 = f33
|
||||
fB2 = f33
|
||||
fA7 = f34
|
||||
fPiBy2 = f34
|
||||
fA9 = f35
|
||||
fA11 = f36
|
||||
fB10 = f35
|
||||
fB11 = f36
|
||||
fA13 = f37
|
||||
fA15 = f38
|
||||
fB4 = f37
|
||||
fB5 = f38
|
||||
fA17 = f39
|
||||
fA19 = f40
|
||||
fB6 = f39
|
||||
fB7 = f40
|
||||
fA21 = f41
|
||||
fA23 = f42
|
||||
fB3 = f41
|
||||
fB8 = f42
|
||||
fA25 = f43
|
||||
fA27 = f44
|
||||
fB9 = f43
|
||||
fB12 = f44
|
||||
fA29 = f45
|
||||
fA31 = f46
|
||||
fA33 = f47
|
||||
fA35 = f48
|
||||
fBaseP = f49
|
||||
fB0 = f50
|
||||
fSignedS = f51
|
||||
fD = f52
|
||||
fHalf = f53
|
||||
fR = f54
|
||||
fCloseTo1Pol = f55
|
||||
fSignX = f56
|
||||
fDenoBound = f57
|
||||
fNormX = f58
|
||||
fX8 = f59
|
||||
fRSqr = f60
|
||||
fRQuadr = f61
|
||||
fR8 = f62
|
||||
fX16 = f63
|
||||
// Data tables
|
||||
//==============================================================
|
||||
RODATA
|
||||
.align 16
|
||||
LOCAL_OBJECT_START(asin_base_range_table)
|
||||
// Ai: Polynomial coefficients for the asin(x), |x| < .625000
|
||||
// Bi: Polynomial coefficients for the asin(x), |x| > .625000
|
||||
data8 0xBFDAAB56C01AE468 //A29
|
||||
data8 0x3FE1C470B76A5B2B //A31
|
||||
data8 0xBFDC5FF82A0C4205 //A33
|
||||
data8 0x3FC71FD88BFE93F0 //A35
|
||||
data8 0xB504F333F9DE6487, 0x00003FFF //B0
|
||||
data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3
|
||||
data8 0x3F9F1C71BC4A7823 //A9
|
||||
data8 0x3F96E8BBAAB216B2 //A11
|
||||
data8 0x3F91C4CA1F9F8A98 //A13
|
||||
data8 0x3F8C9DDCEDEBE7A6 //A15
|
||||
data8 0x3F877784442B1516 //A17
|
||||
data8 0x3F859C0491802BA2 //A19
|
||||
data8 0x9999999998C88B8F, 0x00003FFB //A5
|
||||
data8 0x3F6BD7A9A660BF5E //A21
|
||||
data8 0x3F9FC1659340419D //A23
|
||||
data8 0xB6DB6DB798149BDF, 0x00003FFA //A7
|
||||
data8 0xBFB3EF18964D3ED3 //A25
|
||||
data8 0x3FCD285315542CF2 //A27
|
||||
data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1
|
||||
data8 0x3EF0DDA376D10FB3 //B10
|
||||
data8 0xBEB83CAFE05EBAC9 //B11
|
||||
data8 0x3F65FFB67B513644 //B4
|
||||
data8 0x3F5032FBB86A4501 //B5
|
||||
data8 0x3F392162276C7CBA //B6
|
||||
data8 0x3F2435949FD98BDF //B7
|
||||
data8 0xD93923D7FA08341C, 0x00003FF9 //B2
|
||||
data8 0x3F802995B6D90BDB //B3
|
||||
data8 0x3F10DF86B341A63F //B8
|
||||
data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2
|
||||
data8 0x3EFA3EBD6B0ECB9D //B9
|
||||
data8 0x3EDE18BA080E9098 //B12
|
||||
LOCAL_OBJECT_END(asin_base_range_table)
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_LIBM_ENTRY(asin)
|
||||
asin_unnormal_back:
|
||||
{ .mfi
|
||||
getf.d rXBits = f8 // grab bits of input value
|
||||
// set p12 = 1 if x is a NaN, denormal, or zero
|
||||
fclass.m p12, p0 = f8, 0xcf
|
||||
adds rSign = 1, r0
|
||||
}
|
||||
{ .mfi
|
||||
addl rTblAddr = @ltoff(asin_base_range_table),gp
|
||||
// 1 - x = 1 - |x| for positive x
|
||||
fms.s1 f1mX = f1, f1, f8
|
||||
addl rHalf = 0xFFFE, r0 // exponent of 1/2
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625
|
||||
// set p8 = 1 if x < 0
|
||||
fcmp.lt.s1 p8, p9 = f8, f0
|
||||
shl rSign = rSign, 63 // sign bit
|
||||
}
|
||||
{ .mfi
|
||||
// point to the beginning of the table
|
||||
ld8 rTblAddr = [rTblAddr]
|
||||
// 1 + x = 1 - |x| for negative x
|
||||
fma.s1 f1pX = f1, f1, f8
|
||||
adds rOne = 0x3FF, r0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
andcm rAbsXBits = rXBits, rSign // bits of |x|
|
||||
fmerge.s fSignX = f8, f1 // signum(x)
|
||||
shl r0625 = r0625, 48 // bits of DP representation of 0.625
|
||||
}
|
||||
{ .mfb
|
||||
setf.exp fHalf = rHalf // load 1/2 to FP reg (rHalf holds the exponent of 1/2)
|
||||
fma.s1 fXSqr = f8, f8, f0 // x^2
|
||||
// branch on special path if x is a NaN, denormal, or zero
|
||||
(p12) br.cond.spnt asin_special
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
adds rPiBy2Ptr = 272, rTblAddr
|
||||
nop.f 0
|
||||
shl rOne = rOne, 52 // bits of 1.0
|
||||
}
|
||||
{ .mfi
|
||||
adds rTmpPtr1 = 16, rTblAddr
|
||||
nop.f 0
|
||||
// set p6 = 1 if |x| < 0.625
|
||||
cmp.lt p6, p7 = rAbsXBits, r0625
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfpd fA29, fA31 = [rTblAddr] // A29, fA31
|
||||
// 1 - x = 1 - |x| for positive x
|
||||
(p9) fms.s1 fR = f1, f1, f8
|
||||
// point to coefficient of "near 1" polynomial
|
||||
(p7) adds rTmpPtr2 = 176, rTblAddr
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35
|
||||
// 1 + x = 1 - |x| for negative x
|
||||
(p8) fma.s1 fR = f1, f1, f8
|
||||
(p6) adds rTmpPtr2 = 48, rTblAddr
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fB0 = [rTmpPtr1], 16 // B0
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mib
|
||||
adds rTmpPtr3 = 16, rTmpPtr2
|
||||
// set p10 = 1 if |x| = 1.0
|
||||
cmp.eq p10, p0 = rAbsXBits, rOne
|
||||
// branch on special path for |x| = 1.0
|
||||
(p10) br.cond.spnt asin_abs_1
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fA3 = [rTmpPtr2], 48 // A3 or B1
|
||||
nop.f 0
|
||||
adds rTmpPtr1 = 64, rTmpPtr3
|
||||
}
|
||||
{ .mib
|
||||
ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11
|
||||
// set p11 = 1 if |x| > 1.0
|
||||
cmp.gt p11, p0 = rAbsXBits, rOne
|
||||
// branch on special path for |x| > 1.0
|
||||
(p11) br.cond.spnt asin_abs_gt_1
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7
|
||||
// initial approximation of 1 / sqrt(1 - x)
|
||||
frsqrta.s1 f1mXRcp, p0 = f1mX
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5
|
||||
fma.s1 fXCube = fXSqr, f8, f0 // x^3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fA5 = [rTmpPtr2], 48 // A5 or B2
|
||||
// initial approximation of 1 / sqrt(1 + x)
|
||||
frsqrta.s1 f1pXRcp, p0 = f1pX
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8
|
||||
fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
ldfe fA7 = [rTmpPtr1] // A7 or Pi/2
|
||||
fma.s1 fRSqr = fR, fR, f0 // R^2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt asin_base_range;
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB11 = fB11, fR, fB10
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB1 = fB1, fR, fB0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB5 = fB5, fR, fB4
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB7 = fB7, fR, fB6
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB3 = fB3, fR, fB2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB9 = fB9, fR, fB8
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fB12 = fB12, fRSqr, fB11
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fB7 = fB7, fRSqr, fB5
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fB3 = fB3, fRSqr, fB1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 fPiBy2 = fPiBy2, fSignX, f0 // signum(x)*Pi/2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB12 = fB12, fRSqr, fB9
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fB7 = fB7, fRQuadr, fB3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fCloseTo1Pol = fB12, fR8, fB7
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// -signum(x)* S2 = -signum(x)*(S1 + S1*d1)
|
||||
fma.s1 fSignedS = fSignedS, fD, fSignedS
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// signum(x)*(Pi/2 - PolB*S2)
|
||||
fma.s1 fPiBy2 = fSignedS, fCloseTo1Pol, fPiBy2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// -signum(x)*PolB * S2
|
||||
fma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result for 0.625 <= |x| < 1
|
||||
fma.d.s0 f8 = fCloseTo1Pol, fD, fPiBy2
|
||||
// exit here for 0.625 <= |x| < 1
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// here if |x| < 0.625
|
||||
.align 32
|
||||
asin_base_range:
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA33 = fA33, fXSqr, fA31
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA15 = fA15, fXSqr, fA13
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA29 = fA29, fXSqr, fA27
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA25 = fA25, fXSqr, fA23
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA21 = fA21, fXSqr, fA19
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA9 = fA9, fXSqr, fA7
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA5 = fA5, fXSqr, fA3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA35 = fA35, fXQuadr, fA33
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA17 = fA17, fXQuadr, fA15
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA25 = fA25, fXQuadr, fA21
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA9 = fA9, fXQuadr, fA5
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA35 = fA35, fXQuadr, fA29
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA17 = fA17, fXSqr, fA11
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fX16 = fX8, fX8, f0 // x^16
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA35 = fA35, fX8, fA25
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA17 = fA17, fX8, fA9
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fBaseP = fA35, fX16, fA17
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result for |x| < 0.625
|
||||
fma.d.s0 f8 = fBaseP, fXCube, f8
|
||||
// exit here for |x| < 0.625 path
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// here if |x| = 1
|
||||
// asin(x) = sign(x) * Pi/2
|
||||
.align 32
|
||||
asin_abs_1:
|
||||
{ .mfi
|
||||
ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result for |x| = 1.0
|
||||
fma.d.s0 f8 = fPiBy2, fSignX, f0
|
||||
// exit here for |x| = 1.0
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// here if x is a NaN, denormal, or zero
|
||||
.align 32
|
||||
asin_special:
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// set p12 = 1 if x is a NaN
|
||||
fclass.m p12, p0 = f8, 0xc3
|
||||
nop.i 0
|
||||
}
|
||||
{ .mlx
|
||||
nop.m 0
|
||||
// smallest positive DP normalized number
|
||||
movl rDenoBound = 0x0010000000000000
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// set p13 = 1 if x = 0.0
|
||||
fclass.m p13, p0 = f8, 0x07
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnorm.s1 fNormX = f8
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
// load smallest normal to FP reg
|
||||
setf.d fDenoBound = rDenoBound
|
||||
// answer if x is a NaN
|
||||
(p12) fma.d.s0 f8 = f8,f1,f0
|
||||
// exit here if x is a NaN
|
||||
(p12) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
// exit here if x = 0.0
|
||||
(p13) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
// if we still here then x is denormal or unnormal
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// absolute value of normalized x
|
||||
fmerge.s fNormX = f1, fNormX
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// set p14 = 1 if normalized x is greater than or
|
||||
// equal to the smallest normalized value
|
||||
// So, if p14 is set to 1 it means that we deal with
|
||||
// unnormal rather than with "true" denormal
|
||||
fcmp.ge.s1 p14, p0 = fNormX, fDenoBound
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// normalize unnormal input
|
||||
(p14) fnorm.s1 f8 = f8
|
||||
// return to the main path
|
||||
(p14) br.cond.sptk asin_unnormal_back
|
||||
}
|
||||
;;
|
||||
// if we still here it means that input is "true" denormal
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result if x is denormal
|
||||
fma.d.s0 f8 = f8, fXSqr, f8
|
||||
// exit here if x is denormal
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// here if |x| > 1.0
|
||||
// error handler should be called
|
||||
.align 32
|
||||
asin_abs_gt_1:
|
||||
{ .mfi
|
||||
alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
|
||||
fmerge.s FR_X = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 61 // error code
|
||||
frcpa.s0 FR_RESULT, p0 = f0,f0
|
||||
// call error handler routine
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
GLOBAL_LIBM_END(asin)
|
||||
libm_alias_double_other (asin, asin)
|
||||
|
||||
|
||||
|
||||
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,675 +0,0 @@
|
||||
.file "asinf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/02/00 Initial version
|
||||
// 06/28/00 Improved speed
|
||||
// 06/31/00 Changed register allocation because of some duplicate macros
|
||||
// moved nan exit bundle up to gain a cycle.
|
||||
// 08/08/00 Improved speed by avoiding SIR flush.
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 08/17/00 Changed predicate register macro-usage to direct predicate
|
||||
// names due to an assembler bug.
|
||||
// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal.
|
||||
// 03/13/01 Corrected sign of imm1 value in dep instruction.
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
|
||||
|
||||
// Description
|
||||
//=========================================
|
||||
// The asinf function computes the arc sine of x in the range [-pi/2,+pi/2].
|
||||
// A domain error occurs for arguments not in the range [-1,+1].
|
||||
// asinf(+-0) returns +-0
|
||||
// asinf(x) returns a Nan and raises the invalid exception for |x| >1
|
||||
|
||||
// The acosf function returns the arc cosine in the range [0, +pi] radians.
|
||||
// A domain error occurs for arguments not in the range [-1,+1].
|
||||
// acosf(1) returns +0
|
||||
// acosf(x) returns a Nan and raises the invalid exception for |x| >1
|
||||
|
||||
|
||||
// |x| <= sqrt(2)/2. get Ax and Bx
|
||||
|
||||
// poly_p1 = x p1
|
||||
// poly_p3 = x2 p4 + p3
|
||||
// poly_p1 = x2 (poly_p1) + x = x2(x p1) + x
|
||||
// poly_p2 = x2( poly_p3) + p2 = x2(x2 p4 + p3) + p2
|
||||
|
||||
// poly_Ax = x5(x2( poly_p3) + p2) + x2(x p1) + x
|
||||
// = x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
|
||||
|
||||
// poly_p7 = x2 p8 + p7
|
||||
// poly_p5 = x2 p6 + p5
|
||||
|
||||
// poly_p7 = x4 p9 + (poly_p7)
|
||||
// poly_p7 = x4 p9 + (x2 p8 + p7)
|
||||
// poly_Bx = x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5
|
||||
|
||||
// answer1 = x11(x4 (x4 p9 + (x2 p8 + p7)) + x2 p6 + p5) + x5(x2(x2 p4 + p3) + p2) + x2(x p1) + x
|
||||
// = x19 p9 + x17 p8 + x15 p7 + x13 p6 + x11 p5 + x9 p4 + x7 p3 + x5 p2 + x3 p1 + x
|
||||
|
||||
|
||||
|
||||
// |x| > sqrt(2)/2
|
||||
|
||||
// Get z = sqrt(1-x2)
|
||||
|
||||
// Get polynomial in t = 1-x2
|
||||
|
||||
// t2 = t t
|
||||
// t4 = t2 t2
|
||||
|
||||
// poly_p4 = t p5 + p4
|
||||
// poly_p1 = t p1 + 1
|
||||
|
||||
// poly_p6 = t p7 + p6
|
||||
// poly_p2 = t p3 + p2
|
||||
|
||||
// poly_p8 = t p9 + p8
|
||||
|
||||
// poly_p4 = t2 poly_p6 + poly_p4
|
||||
// = t2 (t p7 + p6) + (t p5 + p4)
|
||||
|
||||
// poly_p2 = t2 poly_p2 + poly_p1
|
||||
// = t2 (t p3 + p2) + (t p1 + 1)
|
||||
|
||||
// poly_p4 = t4 poly_p8 + poly_p4
|
||||
// = t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4))
|
||||
|
||||
// P(t) = poly_p2 + t4 poly_p4
|
||||
// = t2 (t p3 + p2) + (t p1 + 1) + t4 (t4 (t p9 + p8) + (t2 (t p7 + p6) + (t p5 + p4)))
|
||||
// = t3 p3 + t2 p2 + t p1 + 1 + t9 p9 + t8 p8 + t7 p7 + t6 p6 + t5 p5 + t4 p4
|
||||
|
||||
|
||||
// answer2 = - sign(x) z P(t) + (sign(x) pi/2)
|
||||
//
|
||||
|
||||
|
||||
// Assembly macros
|
||||
//=========================================
|
||||
|
||||
// predicate registers
|
||||
//asinf_pred_LEsqrt2by2 = p7
|
||||
//asinf_pred_GTsqrt2by2 = p8
|
||||
|
||||
// integer registers
|
||||
ASINF_Addr1 = r33
|
||||
ASINF_Addr2 = r34
|
||||
ASINF_GR_1by2 = r35
|
||||
|
||||
ASINF_GR_3by2 = r36
|
||||
ASINF_GR_5by2 = r37
|
||||
|
||||
GR_SAVE_B0 = r38
|
||||
GR_SAVE_PFS = r39
|
||||
GR_SAVE_GP = r40
|
||||
|
||||
GR_Parameter_X = r41
|
||||
GR_Parameter_Y = r42
|
||||
GR_Parameter_RESULT = r43
|
||||
GR_Parameter_TAG = r44
|
||||
|
||||
// floating point registers
|
||||
|
||||
asinf_y = f32
|
||||
asinf_abs_x = f33
|
||||
asinf_x2 = f34
|
||||
asinf_sgn_x = f35
|
||||
|
||||
asinf_1by2 = f36
|
||||
asinf_3by2 = f37
|
||||
asinf_5by2 = f38
|
||||
asinf_coeff_P3 = f39
|
||||
asinf_coeff_P8 = f40
|
||||
|
||||
asinf_coeff_P1 = f41
|
||||
asinf_coeff_P4 = f42
|
||||
asinf_coeff_P5 = f43
|
||||
asinf_coeff_P2 = f44
|
||||
asinf_coeff_P7 = f45
|
||||
|
||||
asinf_coeff_P6 = f46
|
||||
asinf_coeff_P9 = f47
|
||||
asinf_x2 = f48
|
||||
asinf_x3 = f49
|
||||
asinf_x4 = f50
|
||||
|
||||
asinf_x8 = f51
|
||||
asinf_x5 = f52
|
||||
asinf_const_piby2 = f53
|
||||
asinf_const_sqrt2by2 = f54
|
||||
asinf_x11 = f55
|
||||
|
||||
asinf_poly_p1 = f56
|
||||
asinf_poly_p3 = f57
|
||||
asinf_sinf1 = f58
|
||||
asinf_poly_p2 = f59
|
||||
asinf_poly_Ax = f60
|
||||
|
||||
asinf_poly_p7 = f61
|
||||
asinf_poly_p5 = f62
|
||||
asinf_sgnx_t4 = f63
|
||||
asinf_poly_Bx = f64
|
||||
asinf_t = f65
|
||||
|
||||
asinf_yby2 = f66
|
||||
asinf_B = f67
|
||||
asinf_B2 = f68
|
||||
asinf_Az = f69
|
||||
asinf_dz = f70
|
||||
|
||||
asinf_Sz = f71
|
||||
asinf_d2z = f72
|
||||
asinf_Fz = f73
|
||||
asinf_z = f74
|
||||
asinf_sgnx_z = f75
|
||||
|
||||
asinf_t2 = f76
|
||||
asinf_2poly_p4 = f77
|
||||
asinf_2poly_p6 = f78
|
||||
asinf_2poly_p1 = f79
|
||||
asinf_2poly_p2 = f80
|
||||
|
||||
asinf_2poly_p8 = f81
|
||||
asinf_t4 = f82
|
||||
asinf_Pt = f83
|
||||
asinf_sgnx_2poly_p2 = f84
|
||||
asinf_sgn_x_piby2 = f85
|
||||
|
||||
asinf_poly_p7a = f86
|
||||
asinf_2poly_p4a = f87
|
||||
asinf_2poly_p4b = f88
|
||||
asinf_2poly_p2a = f89
|
||||
asinf_poly_p1a = f90
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(asinf_coeff_1_table)
|
||||
data8 0x3FC5555607DCF816 // P1
|
||||
data8 0x3F9CF81AD9BAB2C6 // P4
|
||||
data8 0x3FC59E0975074DF3 // P7
|
||||
data8 0xBFA6F4CC2780AA1D // P6
|
||||
data8 0x3FC2DD45292E93CB // P9
|
||||
data8 0x3fe6a09e667f3bcd // sqrt(2)/2
|
||||
LOCAL_OBJECT_END(asinf_coeff_1_table)
|
||||
|
||||
LOCAL_OBJECT_START(asinf_coeff_2_table)
|
||||
data8 0x3FA6F108E31EFBA6 // P3
|
||||
data8 0xBFCA31BF175D82A0 // P8
|
||||
data8 0x3FA30C0337F6418B // P5
|
||||
data8 0x3FB332C9266CB1F9 // P2
|
||||
data8 0x3ff921fb54442d18 // pi_by_2
|
||||
LOCAL_OBJECT_END(asinf_coeff_2_table)
|
||||
|
||||
|
||||
.section .text
|
||||
// asinf: single precision arc sine.  The argument x arrives in f8 and the
// result is returned in f8.
//
// Control flow, as implemented below:
//   x is NaN            -> return x*1+0 (fma.s.s0)
//   x is zero           -> return x unchanged
//   |x| == 1            -> ASINF_ABS_ONE: return sgn(x)*pi/2
//   |x| >  1            -> GR_Parameter_TAG = 62, branch to __libm_error_region
//   |x| <= sqrt(2)/2 (p7):
//        result = poly_Ax + x^11 * poly_Bx
//               = x + P1*x^3 + P2*x^5 + ... + P9*x^19
//   otherwise (p8), with t = 1 - x^2:
//        result = sgn(x)*pi/2 - z * Pt
//        Pt = sgn(x) * (1 + P1*t + P2*t^2 + ... + P9*t^9)
//        z  = Az*(1 + Fz), built from the frsqrta estimate B of 1/sqrt(t)
//             (presumably a refined sqrt(t) -- confirm against the
//             algorithm description at the top of the file)
//
// Both polynomial paths are computed speculatively in parallel; only the
// final steps are predicated on p7/p8.
GLOBAL_LIBM_ENTRY(asinf)
|
||||
|
||||
// Load the addresses of the two tables.
|
||||
// Then, load the coefficients and other constants.
|
||||
|
||||
{ .mfi
|
||||
alloc r32 = ar.pfs,1,8,4,0
|
||||
fnma.s1 asinf_t = f8,f8,f1 // t = 1 - x^2
|
||||
dep.z ASINF_GR_1by2 = 0x3f,24,8 // 0x3f000000
|
||||
}
|
||||
{ .mfi
|
||||
addl ASINF_Addr1 = @ltoff(asinf_coeff_1_table),gp
|
||||
fma.s1 asinf_x2 = f8,f8,f0 // x2 = x^2
|
||||
addl ASINF_Addr2 = @ltoff(asinf_coeff_2_table),gp ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ld8 ASINF_Addr1 = [ASINF_Addr1]
|
||||
fmerge.s asinf_abs_x = f1,f8 // |x|: sign of f1 (+1.0), magnitude of x
|
||||
dep ASINF_GR_3by2 = -1,r0,22,8 // 0x3fc00000
|
||||
}
|
||||
{ .mlx
|
||||
nop.m 999
|
||||
movl ASINF_GR_5by2 = 0x40200000;;
|
||||
}
|
||||
|
||||
|
||||
|
||||
{ .mfi
|
||||
setf.s asinf_1by2 = ASINF_GR_1by2
|
||||
fmerge.s asinf_sgn_x = f8,f1 // sgn(x): sign of x, magnitude 1.0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ld8 ASINF_Addr2 = [ASINF_Addr2]
|
||||
nop.f 0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
// NOTE(review): p11/p12 set here are not read again anywhere in this function.
{ .mfi
|
||||
setf.s asinf_5by2 = ASINF_GR_5by2
|
||||
fcmp.lt.s1 p11,p12 = f8,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mmf
|
||||
ldfpd asinf_coeff_P1,asinf_coeff_P4 = [ASINF_Addr1],16
|
||||
setf.s asinf_3by2 = ASINF_GR_3by2
|
||||
fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfpd asinf_coeff_P7,asinf_coeff_P6 = [ASINF_Addr1],16
|
||||
fma.s1 asinf_t2 = asinf_t,asinf_t,f0 // t2 = t^2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd asinf_coeff_P3,asinf_coeff_P8 = [ASINF_Addr2],16
|
||||
fma.s1 asinf_x4 = asinf_x2,asinf_x2,f0 // x4 = x^4
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfpd asinf_coeff_P9,asinf_const_sqrt2by2 = [ASINF_Addr1]
|
||||
fclass.m.unc p10,p0 = f8, 0x07 //@zero
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd asinf_coeff_P5,asinf_coeff_P2 = [ASINF_Addr2],16
|
||||
fma.s1 asinf_x3 = f8,asinf_x2,f0 // x3 = x^3
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfd asinf_const_piby2 = [ASINF_Addr2]
|
||||
frsqrta.s1 asinf_B,p0 = asinf_t // B = reciprocal square root estimate of t
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p8) fma.s.s0 f8 = f8,f1,f0
|
||||
(p8) br.ret.spnt b0 ;; // Exit if x=nan
|
||||
}
|
||||
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fcmp.eq.s1 p6,p0 = asinf_abs_x,f1 // p6 = (|x| == 1)
|
||||
(p10) br.ret.spnt b0 ;; // Exit if x=0
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.gt.s1 p9,p0 = asinf_abs_x,f1 // p9 = (|x| > 1)
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_x8 = asinf_x4,asinf_x4,f0 // x8 = x^8
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0 // t4 = t^4
|
||||
(p6) br.cond.spnt ASINF_ABS_ONE ;; // Branch if |x|=1
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_x5 = asinf_x2,asinf_x3,f0 // x5 = x^5
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
(p9) mov GR_Parameter_TAG = 62
|
||||
fma.s1 asinf_yby2 = asinf_t,asinf_1by2,f0 // yby2 = t/2
|
||||
(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_Az = asinf_t,asinf_B,f0 // Az = t*B
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_B2 = asinf_B,asinf_B,f0 // B2 = B^2
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p1 = f8,asinf_coeff_P1,f0 // P1*x
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_2poly_p1 = asinf_coeff_P1,asinf_t,f1 // 1 + P1*t
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p3 = asinf_coeff_P4,asinf_x2,asinf_coeff_P3 // P3 + P4*x^2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_2poly_p6 = asinf_coeff_P7,asinf_t,asinf_coeff_P6 // P6 + P7*t
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p7 = asinf_x2,asinf_coeff_P8,asinf_coeff_P7 // P7 + P8*x^2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_2poly_p2 = asinf_coeff_P3,asinf_t,asinf_coeff_P2 // P2 + P3*t
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p5 = asinf_x2,asinf_coeff_P6,asinf_coeff_P5 // P5 + P6*x^2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_2poly_p4 = asinf_coeff_P5,asinf_t,asinf_coeff_P4 // P4 + P5*t
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.d.s1 asinf_x11 = asinf_x8,asinf_x3,f0 // x11 = x^11
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fnma.s1 asinf_dz = asinf_B2,asinf_yby2,asinf_1by2 // dz = 1/2 - B^2*(t/2)
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p1a = asinf_x2,asinf_poly_p1,f8 // x + P1*x^3
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_2poly_p8 = asinf_coeff_P9,asinf_t,asinf_coeff_P8 // P8 + P9*t
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
// Determine the region in which |x| lies: p7 if |x| <= sqrt(2)/2, p8 otherwise
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.le.s1 p7,p8 = asinf_abs_x,asinf_const_sqrt2by2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p2 = asinf_x2,asinf_poly_p3,asinf_coeff_P2 // P2 + P3*x^2 + P4*x^4
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_poly_p7a = asinf_x4,asinf_coeff_P9,asinf_poly_p7 // P7 + P8*x^2 + P9*x^4
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 asinf_2poly_p2a = asinf_2poly_p2,asinf_t2,asinf_2poly_p1 // 1 + P1*t + P2*t^2 + P3*t^3
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_sgnx_t4 = asinf_sgn_x,asinf_t4,f0 // sgn(x)*t^4
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_2poly_p4a = asinf_2poly_p6,asinf_t2,asinf_2poly_p4 // P4 + P5*t + P6*t^2 + P7*t^3
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_Sz = asinf_5by2,asinf_dz,asinf_3by2 // Sz = 3/2 + (5/2)*dz
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_d2z = asinf_dz,asinf_dz,f0 // d2z = dz^2
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_sgn_x_piby2 = asinf_sgn_x,asinf_const_piby2,f0 // sgn(x)*pi/2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.d.s1 asinf_poly_Ax = asinf_x5,asinf_poly_p2,asinf_poly_p1a // x + P1*x^3 + ... + P4*x^9
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.d.s1 asinf_poly_Bx = asinf_x4,asinf_poly_p7a,asinf_poly_p5 // P5 + P6*x^2 + ... + P9*x^8
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_sgnx_2poly_p2 = asinf_sgn_x,asinf_2poly_p2a,f0 // sgn(x)*(1 + P1*t + P2*t^2 + P3*t^3)
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_2poly_p4b = asinf_2poly_p8,asinf_t4,asinf_2poly_p4a // P4 + P5*t + ... + P9*t^5
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 asinf_Fz = asinf_d2z,asinf_Sz,asinf_dz // Fz = dz + dz^2*Sz
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.d.s1 asinf_Pt = asinf_2poly_p4b,asinf_sgnx_t4,asinf_sgnx_2poly_p2 // Pt = sgn(x)*(1 + P1*t + ... + P9*t^9)
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.d.s1 asinf_z = asinf_Az,asinf_Fz,asinf_Az // z = Az*(1 + Fz)
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fnma.s.s0 f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2 // |x| > sqrt(2)/2: sgn(x)*pi/2 - z*Pt
|
||||
nop.i 999
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p7) fma.s.s0 f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax // |x| <= sqrt(2)/2: poly_Ax + x^11*poly_Bx
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
ASINF_ABS_ONE:
|
||||
// Here for short exit if |x|=1
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fma.s.s0 f8 = asinf_sgn_x,asinf_const_piby2,f0 // result = sgn(x)*pi/2
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_LIBM_END(asinf)
|
||||
libm_alias_float_other (asin, asin)
|
||||
|
||||
// Stack operations when calling error support.
|
||||
// (1) (2)
|
||||
// sp -> + psp -> +
|
||||
// | |
|
||||
// | | <- GR_Y
|
||||
// | |
|
||||
// | <-GR_Y Y2->|
|
||||
// | |
|
||||
// | | <- GR_X
|
||||
// | |
|
||||
// sp-64 -> + sp -> +
|
||||
// save ar.pfs save b0
|
||||
// save gp
|
||||
|
||||
|
||||
// Stack operations when calling error support.
|
||||
// (3) (call) (4)
|
||||
// psp -> + sp -> +
|
||||
// | |
|
||||
// R3 ->| <- GR_RESULT | -> f8
|
||||
// | |
|
||||
// Y2 ->| <- GR_Y |
|
||||
// | |
|
||||
// X1 ->| |
|
||||
// | |
|
||||
// sp -> + +
|
||||
// restore gp
|
||||
// restore ar.pfs
|
||||
|
||||
// Error path for asinf, entered with |x| > 1 and GR_Parameter_TAG already
// set by the caller.  It builds a 64-byte stack frame, stores three
// single-precision values in it, passes their addresses to
// __libm_error_support, and reloads the final result into f8.
//
// Frame layout relative to the new sp, as established below:
//   sp+16  GR_Parameter_X       <- f8 (the original argument)
//   sp+32  GR_Parameter_Y       <- f1 (placeholder second argument)
//   sp+48  GR_Parameter_RESULT  <- f9 (default result), read back after the call
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 999
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
// GR_Parameter_Y (old sp-32 == new sp+32) is post-incremented by 16 here,
// leaving it at new sp+48 for the parameter 3 store further down.
{ .mmi
|
||||
stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
|
||||
.body
|
||||
// f9 = frcpa of f0 by f0 under status field 0; it is stored below as the
// default result.  Presumably this yields a NaN and raises the invalid
// flag -- confirm against the frcpa definition in the ISA manual.
{ .mfi
|
||||
nop.m 0
|
||||
frcpa.s0 f9,p0 = f0,f0
|
||||
nop.i 0
|
||||
};;
|
||||
|
||||
// NOTE: the "Parameter 3 address" remark on the nop.b below describes the
// add that precedes it (GR_Parameter_RESULT = GR_Parameter_Y = sp+48).
{ .mib
|
||||
stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y
|
||||
nop.b 0 // Parameter 3 address
|
||||
}
|
||||
// The store uses GR_Parameter_Y while it still holds sp+48; the add in the
// same bundle then moves it back to sp+32, the parameter 2 address.
{ .mib
|
||||
stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
// Recompute the result address after the call instead of relying on the
// register contents set up before it.
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,900 +0,0 @@
|
||||
.file "atan2f.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 06/01/00 Initial version
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 08/17/00 Changed predicate register macro-usage to direct predicate
|
||||
// names due to an assembler bug.
|
||||
// 01/05/01 Fixed flag settings for denormal input.
|
||||
// 01/19/01 Added documentation
|
||||
// 01/30/01 Improved speed
|
||||
// 02/06/02 Corrected .section statement
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
|
||||
// Description
|
||||
//=========================================
|
||||
// The atan2 function computes the principal value of the arc tangent of y/x using
|
||||
// the signs of both arguments to determine the quadrant of the return value.
|
||||
// A domain error may occur if both arguments are zero.
|
||||
|
||||
// The atan2 function returns the arc tangent of y/x in the range [-pi,+pi] radians.
|
||||
|
||||
//..
|
||||
//..Let (v,u) = (y,x) if |y| <= |x|, and (v,u) = (x,y) otherwise. Note that
|
||||
//..v and u can be negative. We state the relationship between atan2(y,x) and
|
||||
//..atan(v/u).
|
||||
//..
|
||||
//..Let swap = false if v = y, and swap = true if v = x.
|
||||
//..Define C according to the matrix
|
||||
//..
|
||||
//.. TABLE FOR C
|
||||
//.. x +ve x -ve
|
||||
//.. no swap (swap = false) sgn(y)*0 sgn(y)*pi
|
||||
//.. swap (swap = true ) sgn(y)*pi/2 sgn(y)*pi/2
|
||||
//..
|
||||
//.. atan2(y,x) = C + atan(v/u) if no swap
|
||||
//.. atan2(y,x) = C - atan(v/u) if swap
|
||||
//..
|
||||
//..This relationship is more efficient to compute as we accommodate signs in v and u
|
||||
//..saving the need to obtain the absolute value before computation can proceed.
|
||||
//..
|
||||
//..Suppose (v,u) = (y,x), we calculate atan(v/u) as follows:
|
||||
//..A = y * frcpa(x) (so A = (y/x)(1 - beta))
|
||||
//..atan(y/x) = atan(A) + atan( ((y/x)-A)/(1 + (y/x)A) ), the second term is
|
||||
//..a correction.
|
||||
//..atan(A) is approximated by a polynomial
|
||||
//..A + p1 A^3 + p2 A^5 + ... + p10 A^21,
|
||||
//..atan(G) is approximated as follows:
|
||||
//..Let G = (y - Ax)/(x + Ay), atan(G) can be approximated by G + g * p1
|
||||
//..where g is a limited precision approximation to G via g = (y - Ax)*frcpa(x + Ay).
|
||||
//..
|
||||
//..Suppose (v,u) = (x,y), we calculate atan(v/u) as follows:
|
||||
//..Z = x * frcpa(y) (so Z = (x/y)(1 - beta))
|
||||
//..atan(x/y) = atan(Z) + atan( ((x/y)-Z)/(1 + (x/y)Z) ), the second term is
|
||||
//..a correction.
|
||||
//..atan(Z) is approximated by a polynomial
|
||||
//..Z + p1 Z^3 + p2 Z^5 + ... + p10 Z^21,
|
||||
//..atan(T) is approximated as follows:
|
||||
//..Let T = (x - Zy)/(y + Zx), atan(T) can be approximated by T + t * p1
|
||||
//..where t is a limited precision approximation to T via t = (x - Zy)*frcpa(y + Zx).
|
||||
//..
|
||||
//..
|
||||
//..A = y * frcpa(x)
|
||||
//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
|
||||
//..
|
||||
//..This polynomial is computed as follows:
|
||||
//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
|
||||
//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
|
||||
//..
|
||||
//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
|
||||
//..poly_A1 = poly_A2 + A4 * poly_A1
|
||||
//..poly_A1 = poly_A3 + A4 * poly_A1
|
||||
//..
|
||||
//..poly_A4 = p1 * A
|
||||
//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
|
||||
//..poly_A5 = p2 + Asq * poly_A5
|
||||
//..poly_A4 = poly_A4 + A5 * poly_A5
|
||||
//..
|
||||
//..atan_A = poly_A4 + A11 * poly_A1
|
||||
//..
|
||||
//..atan(G) is approximated as follows:
|
||||
//..G_numer = y - A*x, G_denom = x + A*y
|
||||
//..H1 = frcpa(G_denom)
|
||||
//..H_beta = 1 - H1 * G_denom
|
||||
//..H2 = H1 + H1 * H_beta
|
||||
//..H_beta2 = H_beta*H_beta
|
||||
//..H3 = H2 + H2*H_beta2
|
||||
//..g = H1 * G_numer; gsq = g*g; atan_G = g*p1, atan_G = atan_G*gsq
|
||||
//..atan_G = G_numer*H3 + atan_G
|
||||
//..
|
||||
//..
|
||||
//..A = y * frcpa(x)
|
||||
//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
|
||||
//..
|
||||
//..This polynomial is computed as follows:
|
||||
//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
|
||||
//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
|
||||
//..
|
||||
//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
|
||||
//..poly_A1 = poly_A2 + A4 * poly_A1
|
||||
//..poly_A1 = poly_A3 + A4 * poly_A1
|
||||
//..
|
||||
//..poly_A4 = p1 * A
|
||||
//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
|
||||
//..poly_A5 = p2 + Asq * poly_A5
|
||||
//..poly_A4 = poly_A4 + A5 * poly_A5
|
||||
//..
|
||||
//..atan_A = poly_A4 + A11 * poly_A1
|
||||
//..
|
||||
//..
|
||||
//..====================================================================
|
||||
//.. COEFFICIENTS USED IN THE COMPUTATION
|
||||
//..====================================================================
|
||||
|
||||
//coef_pj, j = 1,2,...,10; atan(A) ~=~ A + p1 A^3 + p2 A^5 + ... + p10 A^21
|
||||
//
|
||||
// coef_p1 = -.3333332707155439167401311806315789E+00
|
||||
// coef_p1 in dbl = BFD5 5555 1219 1621
|
||||
//
|
||||
// coef_p2 = .1999967670926658391827857030875748E+00
|
||||
// coef_p2 in dbl = 3FC9 997E 7AFB FF4E
|
||||
//
|
||||
// coef_p3 = -.1427989384500152360161563301087296E+00
|
||||
// coef_p3 in dbl = BFC2 473C 5145 EE38
|
||||
//
|
||||
// coef_p4 = .1105852823460720770079031213661163E+00
|
||||
// coef_p4 in dbl = 3FBC 4F51 2B18 65F5
|
||||
//
|
||||
// coef_p5 = -.8811839915595312348625710228448363E-01
|
||||
// coef_p5 in dbl = BFB6 8EED 6A8C FA32
|
||||
//
|
||||
// coef_p6 = .6742329836955067042153645159059714E-01
|
||||
// coef_p6 in dbl = 3FB1 42A7 3D7C 54E3
|
||||
//
|
||||
// coef_p7 = -.4468571068774672908561591262231909E-01
|
||||
// coef_p7 in dbl = BFA6 E10B A401 393F
|
||||
//
|
||||
// coef_p8 = .2252333246746511135532726960586493E-01
|
||||
// coef_p8 in dbl = 3F97 105B 4160 F86B
|
||||
//
|
||||
// coef_p9 = -.7303884867007574742501716845542314E-02
|
||||
// coef_p9 in dbl = BF7D EAAD AA33 6451
|
||||
//
|
||||
// coef_p10 = .1109686868355312093949039454619058E-02
|
||||
// coef_p10 in dbl = 3F52 2E5D 33BC 9BAA
|
||||
//
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// Y x Result
|
||||
// +number +inf +0
|
||||
// -number +inf -0
|
||||
// +number -inf +pi
|
||||
// -number -inf -pi
|
||||
//
|
||||
// +inf +number +pi/2
|
||||
// -inf +number -pi/2
|
||||
// +inf -number +pi/2
|
||||
// -inf -number -pi/2
|
||||
//
|
||||
// +inf +inf +pi/4
|
||||
// -inf +inf -pi/4
|
||||
// +inf -inf +3pi/4
|
||||
// -inf -inf -3pi/4
|
||||
//
|
||||
// +1 +1 +pi/4
|
||||
// -1 +1 -pi/4
|
||||
// +1 -1 +3pi/4
|
||||
// -1 -1 -3pi/4
|
||||
//
|
||||
// +number +0 +pi/2 // does not raise DBZ
|
||||
// -number +0 -pi/2 // does not raise DBZ
|
||||
// +number -0 +pi/2 // does not raise DBZ
|
||||
// -number -0 -pi/2 // does not raise DBZ
|
||||
//
|
||||
// +0 +number +0
|
||||
// -0 +number -0
|
||||
// +0 -number +pi
|
||||
// -0 -number -pi
|
||||
//
|
||||
// +0 +0 +0 // does not raise invalid
|
||||
// -0 +0 -0 // does not raise invalid
|
||||
// +0 -0 +pi // does not raise invalid
|
||||
// -0 -0 -pi // does not raise invalid
|
||||
//
|
||||
// Nan anything quiet Y
|
||||
// anything NaN quiet X
|
||||
|
||||
// atan2(+-0/+-0) sets double error tag to 37
|
||||
// atan2f(+-0/+-0) sets single error tag to 38
|
||||
// These are domain errors.
|
||||
|
||||
|
||||
//
|
||||
// Assembly macros
|
||||
//=========================================
|
||||
|
||||
|
||||
// integer registers
// atan2f issues "alloc r32 = ar.pfs,1,5,4,0".  Assuming the usual IA-64
// input/local/output ordering of the stacked registers, r33-r37 below are
// locals and r38-r41 (GR_Parameter_*) are the output registers -- confirm
// against the alloc definition.
|
||||
atan2f_GR_Addr_1 = r33 // pointer into atan2f_coef_table1
|
||||
atan2f_GR_Addr_2 = r34 // atan2f_GR_Addr_1 + 0x30, walks atan2f_coef_table2
|
||||
GR_SAVE_B0 = r35
|
||||
|
||||
GR_SAVE_PFS = r36
|
||||
GR_SAVE_GP = r37
|
||||
|
||||
GR_Parameter_X = r38
|
||||
GR_Parameter_Y = r39
|
||||
GR_Parameter_RESULT = r40
|
||||
GR_Parameter_TAG = r41
|
||||
|
||||
// floating point registers
// f32-f41 hold the polynomial coefficients p1..p10 and f42-f45 the pi
// constants, all loaded from the two coefficient tables; the remaining
// registers are intermediates of the computation.
|
||||
atan2f_coef_p1 = f32
|
||||
atan2f_coef_p10 = f33
|
||||
atan2f_coef_p7 = f34
|
||||
atan2f_coef_p6 = f35
|
||||
|
||||
atan2f_coef_p3 = f36
|
||||
atan2f_coef_p2 = f37
|
||||
atan2f_coef_p9 = f38
|
||||
atan2f_coef_p8 = f39
|
||||
atan2f_coef_p5 = f40
|
||||
|
||||
atan2f_coef_p4 = f41
|
||||
atan2f_const_piby2 = f42
|
||||
atan2f_const_pi = f43
|
||||
atan2f_const_piby4 = f44
|
||||
atan2f_const_3piby4 = f45
|
||||
|
||||
atan2f_xsq = f46 // x*x
|
||||
atan2f_ysq = f47 // y*y
|
||||
atan2f_xy = f48 // x*y
|
||||
atan2f_const_1 = f49 // sgn(y)*0 when x >= 0, sgn(y)*1 when x < 0
|
||||
atan2f_sgn_Y = f50 // sign of y with magnitude 1.0
|
||||
|
||||
atan2f_Z0 = f51 // frcpa approximation to 1/y
|
||||
atan2f_A0 = f52 // frcpa approximation to 1/x
|
||||
atan2f_Z = f53 // Z0 * x
|
||||
atan2f_A = f54 // A0 * y
|
||||
atan2f_C = f55 // sgn(y)*pi/2 if |y| > |x|, else const_1*pi
|
||||
|
||||
atan2f_U = f56 // -Z if |y| > |x|, else A
|
||||
atan2f_Usq = f57
|
||||
atan2f_U4 = f58
|
||||
atan2f_U6 = f59
|
||||
atan2f_U8 = f60
|
||||
|
||||
atan2f_poly_u109 = f61
|
||||
atan2f_poly_u87 = f62
|
||||
atan2f_poly_u65 = f63
|
||||
atan2f_poly_u43 = f64
|
||||
atan2f_poly_u21 = f65
|
||||
|
||||
atan2f_poly_u10to7 = f66
|
||||
atan2f_poly_u6to3 = f67
|
||||
atan2f_poly_u10to3 = f68
|
||||
atan2f_poly_u10to0 = f69
|
||||
atan2f_poly_u210 = f70
|
||||
|
||||
atan2f_T_numer = f71 // x - Z0*x*y
|
||||
atan2f_T_denom = f72 // y + Z0*x*x
|
||||
atan2f_G_numer = f73 // y - A0*x*y
|
||||
atan2f_G_denom = f74 // x + A0*y*y
|
||||
atan2f_p1rnum = f75
|
||||
|
||||
atan2f_R_denom = f76
|
||||
atan2f_R_numer = f77
|
||||
atan2f_pR = f78
|
||||
atan2f_pRC = f79
|
||||
atan2f_pQRC = f80
|
||||
|
||||
atan2f_Q1 = f81 // frcpa approximation to 1/R_denom
|
||||
atan2f_Q_beta = f82 // 1 - Q1*R_denom
|
||||
atan2f_Q2 = f83
|
||||
atan2f_Q_beta2 = f84
|
||||
atan2f_Q3 = f85
|
||||
|
||||
atan2f_r = f86 // Q1 * R_numer
|
||||
atan2f_rsq = f87
|
||||
atan2f_poly_atan_U = f88
|
||||
|
||||
|
||||
// predicate registers
|
||||
//atan2f_Pred_Swap = p6 // |y| > |x|
|
||||
//atan2f_Pred_noSwap = p7 // |y| <= |x|
|
||||
//atan2f_Pred_Xpos = p8 // x >= 0
|
||||
//atan2f_Pred_Xneg = p9 // x < 0
|
||||
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
// First atan2f coefficient table (6 doubles, 0x30 bytes), read through
// atan2f_GR_Addr_1 with three post-incremented ldfpd pair loads:
// (p1,p10), (p7,p6), (p3,p2).  The order is the load order, not the
// polynomial order.
LOCAL_OBJECT_START(atan2f_coef_table1)
|
||||
data8 0xBFD5555512191621 // p1
|
||||
data8 0x3F522E5D33BC9BAA // p10
|
||||
data8 0xBFA6E10BA401393F // p7
|
||||
data8 0x3FB142A73D7C54E3 // p6
|
||||
data8 0xBFC2473C5145EE38 // p3
|
||||
data8 0x3FC9997E7AFBFF4E // p2
|
||||
LOCAL_OBJECT_END(atan2f_coef_table1)
|
||||
|
||||
// Second atan2f coefficient table, read with ldfpd pair loads:
// (p9,p8), (p5,p4), (pi/2,pi), (pi/4,3pi/4).
// atan2f does not look this symbol up: it computes
// atan2f_GR_Addr_2 = atan2f_GR_Addr_1 + 0x30, so this table must stay
// immediately after the 0x30 bytes of atan2f_coef_table1.
LOCAL_OBJECT_START(atan2f_coef_table2)
|
||||
data8 0xBF7DEAADAA336451 // p9
|
||||
data8 0x3F97105B4160F86B // p8
|
||||
data8 0xBFB68EED6A8CFA32 // p5
|
||||
data8 0x3FBC4F512B1865F5 // p4
|
||||
data8 0x3ff921fb54442d18 // pi/2
|
||||
data8 0x400921fb54442d18 // pi
|
||||
data8 0x3fe921fb54442d18 // pi/4
|
||||
data8 0x4002d97c7f3321d2 // 3pi/4
|
||||
LOCAL_OBJECT_END(atan2f_coef_table2)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_IEEE754_ENTRY(atan2f)
|
||||
|
||||
{ .mfi
|
||||
alloc r32 = ar.pfs,1,5,4,0
|
||||
frcpa.s1 atan2f_Z0,p0 = f1,f8 // Approx to 1/y
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
addl atan2f_GR_Addr_1 = @ltoff(atan2f_coef_table1),gp
|
||||
fma.s1 atan2f_xsq = f9,f9,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ld8 atan2f_GR_Addr_1 = [atan2f_GR_Addr_1]
|
||||
frcpa.s1 atan2f_A0,p0 = f1,f9 // Approx to 1/x
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_ysq = f8,f8,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.ge.s1 p8,p9 = f9,f0 // Set p8 if x>=0, p9 if x<0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_xy = f9,f8,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
add atan2f_GR_Addr_2 = 0x30, atan2f_GR_Addr_1
|
||||
fmerge.s atan2f_sgn_Y = f8,f1
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mmf
|
||||
ldfpd atan2f_coef_p1,atan2f_coef_p10 = [atan2f_GR_Addr_1],16
|
||||
ldfpd atan2f_coef_p9,atan2f_coef_p8 = [atan2f_GR_Addr_2],16
|
||||
fclass.m p10,p0 = f9,0xe7 // Test x @inf|@snan|@qnan|@zero
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfpd atan2f_coef_p7,atan2f_coef_p6 = [atan2f_GR_Addr_1],16
|
||||
fma.s1 atan2f_T_denom = atan2f_Z0,atan2f_xsq,f8
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd atan2f_coef_p5,atan2f_coef_p4 = [atan2f_GR_Addr_2],16
|
||||
fma.s1 atan2f_Z = atan2f_Z0,f9,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
ldfpd atan2f_coef_p3,atan2f_coef_p2 = [atan2f_GR_Addr_1],16
|
||||
fma.s1 atan2f_G_denom = atan2f_A0,atan2f_ysq,f9
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
ldfpd atan2f_const_piby2,atan2f_const_pi = [atan2f_GR_Addr_2],16
|
||||
fma.s1 atan2f_A = atan2f_A0,f8,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
ldfpd atan2f_const_piby4,atan2f_const_3piby4 = [atan2f_GR_Addr_2]
|
||||
fclass.m p11,p0 = f8,0xe7 // Test y @inf|@snan|@qnan|@zero
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fnma.s1 atan2f_T_numer = atan2f_Z0,atan2f_xy,f9
|
||||
(p10) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on x nan,inf,zero
|
||||
}
|
||||
|
||||
|
||||
// p6 if |y|>|x|, p7 if |x|>=|y| , use xsq and ysq for test
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.gt.s1 p6,p7 = atan2f_ysq,atan2f_xsq
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fnma.s1 atan2f_G_numer = atan2f_A0,atan2f_xy,f8
|
||||
(p11) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on y nan,inf,zero
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f0,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p9) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fnma.s1 atan2f_U = atan2f_Z,f1,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fma.s1 atan2f_Usq = atan2f_Z,atan2f_Z,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 atan2f_U = atan2f_A,f1,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 atan2f_Usq = atan2f_A,atan2f_A,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_T_denom
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fma.s1 atan2f_R_denom = atan2f_T_denom,f1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_G_denom
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 atan2f_R_denom = atan2f_G_denom,f1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fnma.s1 atan2f_R_numer = atan2f_T_numer,f1,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 atan2f_R_numer = atan2f_G_numer,f1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fnma.s1 atan2f_p1rnum = atan2f_T_numer,atan2f_coef_p1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 atan2f_p1rnum = atan2f_G_numer,atan2f_coef_p1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_U4 = atan2f_Usq,atan2f_Usq,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u109 = atan2f_Usq,atan2f_coef_p10,atan2f_coef_p9
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u87 = atan2f_Usq,atan2f_coef_p8,atan2f_coef_p7
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u65 = atan2f_Usq,atan2f_coef_p6,atan2f_coef_p5
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u43 = atan2f_Usq,atan2f_coef_p4,atan2f_coef_p3
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fnma.s1 atan2f_Q_beta = atan2f_Q1,atan2f_R_denom,f1
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u21 = atan2f_Usq,atan2f_coef_p2,atan2f_coef_p1
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_r = atan2f_Q1,atan2f_R_numer,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fma.s1 atan2f_C = atan2f_sgn_Y,atan2f_const_piby2,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fma.s1 atan2f_C = atan2f_const_1,atan2f_const_pi,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_U6 = atan2f_U4,atan2f_Usq,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_U8 = atan2f_U4,atan2f_U4,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u10to7 = atan2f_U4,atan2f_poly_u109,atan2f_poly_u87
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_pR = atan2f_p1rnum,atan2f_Q1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u6to3 = atan2f_U4,atan2f_poly_u65,atan2f_poly_u43
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_Q2 = atan2f_Q1,atan2f_Q_beta,atan2f_Q1
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_Q_beta2 = atan2f_Q_beta,atan2f_Q_beta,f0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_rsq = atan2f_r,atan2f_r,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u210 = atan2f_Usq,atan2f_poly_u21,f1
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.eq.s0 p8,p0 = f8,f9 // Dummy op to set flag on denormal inputs
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u10to3 = atan2f_U8,atan2f_poly_u10to7,atan2f_poly_u6to3
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_Q3 = atan2f_Q2,atan2f_Q_beta2,atan2f_Q2
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_pRC = atan2f_rsq,atan2f_pR,atan2f_C
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_poly_u10to0 = atan2f_U6,atan2f_poly_u10to3,atan2f_poly_u210
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s1 atan2f_pQRC = atan2f_R_numer,atan2f_Q3,atan2f_pRC
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fma.s.s0 f8 = atan2f_U,atan2f_poly_u10to0,atan2f_pQRC
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
|
||||
|
||||
ATAN2F_XY_INF_NAN_ZERO:
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m p10,p0 = f8,0xc3 // Is y nan
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m p12,p0 = f9,0xc3 // Is x nan
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m p6,p0 = f9,0x21 // Is x +inf
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p10) fma.s.s0 f8 = f9,f8,f0 // Result quietized y if y is nan
|
||||
(p10) br.ret.spnt b0 // Exit if y is nan
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fclass.m.unc p7,p8 = f8,0x23 // x +inf, is y inf
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p12) fnorm.s.s0 f8 = f9 // Result quietized x if x is nan, y not nan
|
||||
(p12) br.ret.spnt b0 // Exit if x is nan, y not nan
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if x or y inf, or x or y zero
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.eq.s0 p15,p0 = f8,f9 // Dummy op to set flag on denormal inputs
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m p11,p12 = f9,0x22 // Is x -inf
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p7) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4
|
||||
(p7) br.ret.spnt b0 // Exit if x +inf and y inf
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p8) fmerge.s f8 = f8,f0 // If x +inf and y not inf, result +-0
|
||||
(p8) br.ret.spnt b0 // Exit if x +inf and y not inf
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p12) fclass.m.unc p13,p0 = f8,0x23 // x not -inf, is y inf
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p11) fclass.m.unc p14,p15 = f8,0x23 // x -inf, is y inf
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m p6,p7 = f9,0x7 // Is x zero
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p13) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2
|
||||
(p13) br.ret.spnt b0 // Exit if x not -inf and y inf
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p14) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p15) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi
|
||||
(p11) br.ret.spnt b0 // Exit if x -inf
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if x or y zero
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p7) fclass.m.unc p8,p9 = f9,0x19 // x not zero, y zero, is x > zero
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p6) fclass.m.unc p10,p11 = f8,0x7 // x zero, is y zero
|
||||
nop.i 999
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p8) fmerge.s f8 = f8, f0 // x > zero and y zero, result is +-zero
|
||||
nop.i 999
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p9) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi
|
||||
(p10) br.cond.spnt __libm_error_region // Branch if x zero and y zero
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p11) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero
|
||||
br.ret.sptk b0 // Final special case exit
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
GLOBAL_IEEE754_END(atan2f)
|
||||
libm_alias_float_other (__atan2, atan2)
|
||||
|
||||
|
||||
// __libm_error_region (atan2f)
// Error tail of atan2f, entered by the "x zero and y zero" branch above.
// Inputs : f8 = y, f9 = x (both zero here), b0 = caller's return address.
// Action : builds the default result in f10, spills x, y and that result to
//          a fresh 64-byte stack frame, and calls __libm_error_support with
//          GR_Parameter_TAG = 38 and pointers to the three slots.
// Output : f8 = value reloaded from the result slot after the call.
// Frame layout relative to the NEW sp (after "add sp=-64,sp"):
//   sp+16 : f8  (parameter 1, addressed by GR_Parameter_X)
//   sp+32 : f9  (parameter 2, addressed by GR_Parameter_Y)
//   sp+48 : f10 (parameter 3 / result, addressed by GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
mov GR_Parameter_TAG = 38
|
||||
// p10 if x matches class mask 0x5, p11 otherwise
fclass.m p10,p11 = f9,0x5 // @zero | @pos
|
||||
;;
|
||||
// Default result: zero carrying the sign of y when p10 is set ...
(p10) fmerge.s f10 = f8, f0
|
||||
// ... otherwise sign(y) * pi
(p11) fma.s.s0 f10 = atan2f_sgn_Y, atan2f_const_pi,f0
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// Computed from the OLD sp, so this is new sp + 32 once the frame exists
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 999
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
// Store x at sp+32; the post-increment leaves GR_Parameter_Y at sp+48
stfs [GR_Parameter_Y] = f9,16 // Store Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
.body
|
||||
{ .mib
|
||||
stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
|
||||
// Parameter 3 (result) address = sp+48, copied from GR_Parameter_Y
add GR_Parameter_RESULT = 0,GR_Parameter_Y
|
||||
nop.b 0 // Branch slot filler
|
||||
}
|
||||
{ .mib
|
||||
// Store the default result at sp+48, then step GR_Parameter_Y back to sp+32
stfs [GR_Parameter_Y] = f10 // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
}
|
||||
;;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
// Recompute the result slot address after the call
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
}
|
||||
;;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
File diff suppressed because it is too large
Load Diff
@ -1,845 +0,0 @@
|
||||
.file "atanhf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 05/22/01 Initial version
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 08/06/02 Improved Itanium 2 performance
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
// 05/26/03 Improved performance, fixed to handle unorms
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// float atanhf(float)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
//
|
||||
// There are 7 paths:
|
||||
// 1. x = +/-0.0
|
||||
// Return atanhf(x) = +/-0.0
|
||||
//
|
||||
// 2. 0.0 < |x| <= MAX_DENORMAL_ABS
|
||||
// Return atanhf(x) = x + sign(x)*x^2
|
||||
//
|
||||
// 3. MAX_DENORMAL_ABS < |x| < 2^(-20)
|
||||
// Return atanhf(x) = Pol3(x), where Pol3(x) = x + x^3
|
||||
//
|
||||
// 4. 2^(-20) <= |x| < 1
|
||||
// Return atanhf(x) = 0.5 * (log(1 + x) - log(1 - x))
|
||||
// Algorithm description for log function see below.
|
||||
//
|
||||
// 5. |x| = 1
|
||||
// Return atanhf(x) = sign(x) * +INF
|
||||
//
|
||||
// 6. 1 < |x| <= +INF
|
||||
// Return atanhf(x) = QNaN
|
||||
//
|
||||
// 7. x = [S,Q]NaN
|
||||
// Return atanhf(x) = QNaN
|
||||
//
|
||||
//==============================================================
|
||||
// Algorithm Description for log(x) function
|
||||
//
|
||||
// Consider x = 2^N * 1.f1 f2 f3 f4...f63
|
||||
// log(x) = log(x * frcpa(x) / frcpa(x))
|
||||
// = log(x * frcpa(x)) + log(1/frcpa(x))
|
||||
// = log(x * frcpa(x)) - log(frcpa(x))
|
||||
//
|
||||
// frcpa(x) = 2^(-N) * frcpa(1.f1 f2 ... f63)
|
||||
//
|
||||
// -log(frcpa(x)) = -log(C)
|
||||
// = -log(2^(-N)) - log(frcpa(1.f1 f2 ... f63))
|
||||
//
|
||||
// -log(frcpa(x)) = -log(C)
|
||||
// = N*log2 - log(frcpa(1.f1 f2 ... f63))
|
||||
//
|
||||
//
|
||||
// log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
|
||||
//
|
||||
// log(x) = N*log2 + log(1./frcpa(1.f1 f2 ... f63)) + log(x * frcpa(x))
|
||||
// log(x) = N*log2 + T + log(frcpa(x) x)
|
||||
//
|
||||
// Log(x) = N*log2 + T + log(C * x)
|
||||
//
|
||||
// C * x = 1 + r
|
||||
//
|
||||
// log(x) = N*log2 + T + log(1 + r)
|
||||
// log(x) = N*log2 + T + Series(r)
|
||||
//
|
||||
// 1.f1 f2 ... f8 has 256 entries.
|
||||
// They are 1 + k/2^8, k = 0 ... 255
|
||||
// These 256 values are the table entries.
|
||||
//
|
||||
// Implementation
|
||||
//==============================================================
|
||||
// C = frcpa(x)
|
||||
// r = C * x - 1
|
||||
//
|
||||
// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4
|
||||
//
|
||||
// x = f * 2^N where f is 1.f_1f_2f_3...f_63
|
||||
// Nfloat = float(n) where n is the true unbiased exponent
|
||||
// pre-index = f_1f_2....f_8
|
||||
// index = pre_index * 8 (each table entry is one 8-byte data8 value)
|
||||
// get the dxt table entry at index + offset = T
|
||||
//
|
||||
// result = (T + Nfloat * log(2)) + rseries
|
||||
//
|
||||
// The T table is calculated as follows
|
||||
// Form x_k = 1 + k/2^8 where k goes from 0... 255
|
||||
// y_k = frcpa(x_k)
|
||||
// log(1/y_k) in quad and round to double-extended
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// Floating Point registers used:
|
||||
// f8, input
|
||||
// f32 -> f59
|
||||
|
||||
// General registers used:
|
||||
// r14 -> r29, r32 -> r39
|
||||
|
||||
// Predicate registers used:
|
||||
// p6 -> p11
|
||||
|
||||
// p6 to filter out case when |x| >= 1
|
||||
// p7 to filter out case when x = [Q,S]NaN or +/-0
|
||||
// p8 to filter out case when |x| < 2^(-20)
|
||||
// p9 to filter out case when x = denormal
|
||||
|
||||
|
||||
// Assembly macros
|
||||
//==============================================================
|
||||
DataPtr = r14
|
||||
RcpTablePtrM = r15
|
||||
RcpTablePtrP = r16
|
||||
rExpbMask = r17
|
||||
rBias = r18
|
||||
rNearZeroBound = r19
|
||||
rArgSExpb = r20
|
||||
rArgExpb = r21
|
||||
rExpbm = r22
|
||||
rExpbp = r23
|
||||
rSigm = r24
|
||||
rSigp = r25
|
||||
rNm = r26
|
||||
rNp = r27
|
||||
rIndm = r28
|
||||
rIndp = r29
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_GP = r34
|
||||
GR_SAVE_PFS = r35
|
||||
|
||||
GR_Parameter_X = r36
|
||||
GR_Parameter_Y = r37
|
||||
GR_Parameter_RESULT = r38
|
||||
atanh_GR_tag = r39
|
||||
|
||||
//==============================================================
|
||||
fOneMx = f33
|
||||
fOnePx = f34
|
||||
fRm2 = f35
|
||||
fRm3 = f36
|
||||
fRp2 = f37
|
||||
fRp3 = f38
|
||||
fRcpM = f39
|
||||
fRcpP = f40
|
||||
fRp = f41
|
||||
fRm = f42
|
||||
fN4CvtM = f43
|
||||
fN4CvtP = f44
|
||||
fNm = f45
|
||||
fNp = f46
|
||||
fLogTm = f47
|
||||
fLogTp = f48
|
||||
fLog2 = f49
|
||||
fArgAbs = f50
|
||||
fNormX = f50
|
||||
fP32m = f51
|
||||
fP32p = f52
|
||||
fP10m = f53
|
||||
fP10p = f54
|
||||
fX2 = f55
|
||||
fP3 = f56
|
||||
fP2 = f57
|
||||
fP1 = f58
|
||||
fHalf = f59
|
||||
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
// Polynomial and scaling constants for atanhf, stored as 8-byte doubles.
// Every coefficient is pre-multiplied by 0.5, as the per-entry comments note.
// atanhf reads them in order through DataPtr:
//   ldfpd fP3, fP2    (post-increment 16)
//   ldfpd fP1, fHalf  (post-increment 16)
//   ldfd  fLog2       (post-increment 16, hence the pad quadword)
// After the third load DataPtr is 48 bytes past the start of this object and
// is used as the base for the 8-byte-stride log table lookup.
// NOTE(review): this assumes atanhf_data2 is laid out immediately after this
// object with no intervening padding - confirm.
LOCAL_OBJECT_START(atanhf_data)
|
||||
data8 0xbfc0001008f39d59 // P3*0.5
|
||||
data8 0x3fc5556073e0c45a // P2*0.5
|
||||
data8 0xbfcffffffffaea15 // P1*0.5
|
||||
data8 0x3fe0000000000000 // 0.5
|
||||
data8 0x3fd62e42fefa39ef // 0.5*ln(2)
|
||||
data8 0x0000000000000000 // pad
|
||||
LOCAL_OBJECT_END(atanhf_data)
|
||||
|
||||
LOCAL_OBJECT_START(atanhf_data2)
|
||||
data8 0x3f50040155d5889e //log(1/frcpa(1+0/256))/2
|
||||
data8 0x3f68121214586b54 //log(1/frcpa(1+1/256))/2
|
||||
data8 0x3f741929f96832f0 //log(1/frcpa(1+2/256))/2
|
||||
data8 0x3f7c317384c75f06 //log(1/frcpa(1+3/256))/2
|
||||
data8 0x3f81a6b91ac73386 //log(1/frcpa(1+4/256))/2
|
||||
data8 0x3f85ba9a5d9ac039 //log(1/frcpa(1+5/256))/2
|
||||
data8 0x3f89d2a8074325f4 //log(1/frcpa(1+6/256))/2
|
||||
data8 0x3f8d6b2725979802 //log(1/frcpa(1+7/256))/2
|
||||
data8 0x3f90c58fa19dfaaa //log(1/frcpa(1+8/256))/2
|
||||
data8 0x3f92954c78cbce1b //log(1/frcpa(1+9/256))/2
|
||||
data8 0x3f94a94d2da96c56 //log(1/frcpa(1+10/256))/2
|
||||
data8 0x3f967c94f2d4bb58 //log(1/frcpa(1+11/256))/2
|
||||
data8 0x3f985188b630f068 //log(1/frcpa(1+12/256))/2
|
||||
data8 0x3f9a6b8abe73af4c //log(1/frcpa(1+13/256))/2
|
||||
data8 0x3f9c441e06f72a9e //log(1/frcpa(1+14/256))/2
|
||||
data8 0x3f9e1e6713606d07 //log(1/frcpa(1+15/256))/2
|
||||
data8 0x3f9ffa6911ab9301 //log(1/frcpa(1+16/256))/2
|
||||
data8 0x3fa0ec139c5da601 //log(1/frcpa(1+17/256))/2
|
||||
data8 0x3fa1dbd2643d190b //log(1/frcpa(1+18/256))/2
|
||||
data8 0x3fa2cc7284fe5f1c //log(1/frcpa(1+19/256))/2
|
||||
data8 0x3fa3bdf5a7d1ee64 //log(1/frcpa(1+20/256))/2
|
||||
data8 0x3fa4b05d7aa012e0 //log(1/frcpa(1+21/256))/2
|
||||
data8 0x3fa580db7ceb5702 //log(1/frcpa(1+22/256))/2
|
||||
data8 0x3fa674f089365a7a //log(1/frcpa(1+23/256))/2
|
||||
data8 0x3fa769ef2c6b568d //log(1/frcpa(1+24/256))/2
|
||||
data8 0x3fa85fd927506a48 //log(1/frcpa(1+25/256))/2
|
||||
data8 0x3fa9335e5d594989 //log(1/frcpa(1+26/256))/2
|
||||
data8 0x3faa2b0220c8e5f5 //log(1/frcpa(1+27/256))/2
|
||||
data8 0x3fab0004ac1a86ac //log(1/frcpa(1+28/256))/2
|
||||
data8 0x3fabf968769fca11 //log(1/frcpa(1+29/256))/2
|
||||
data8 0x3faccfedbfee13a8 //log(1/frcpa(1+30/256))/2
|
||||
data8 0x3fada727638446a2 //log(1/frcpa(1+31/256))/2
|
||||
data8 0x3faea3257fe10f7a //log(1/frcpa(1+32/256))/2
|
||||
data8 0x3faf7be9fedbfde6 //log(1/frcpa(1+33/256))/2
|
||||
data8 0x3fb02ab352ff25f4 //log(1/frcpa(1+34/256))/2
|
||||
data8 0x3fb097ce579d204d //log(1/frcpa(1+35/256))/2
|
||||
data8 0x3fb1178e8227e47c //log(1/frcpa(1+36/256))/2
|
||||
data8 0x3fb185747dbecf34 //log(1/frcpa(1+37/256))/2
|
||||
data8 0x3fb1f3b925f25d41 //log(1/frcpa(1+38/256))/2
|
||||
data8 0x3fb2625d1e6ddf57 //log(1/frcpa(1+39/256))/2
|
||||
data8 0x3fb2d1610c86813a //log(1/frcpa(1+40/256))/2
|
||||
data8 0x3fb340c59741142e //log(1/frcpa(1+41/256))/2
|
||||
data8 0x3fb3b08b6757f2a9 //log(1/frcpa(1+42/256))/2
|
||||
data8 0x3fb40dfb08378003 //log(1/frcpa(1+43/256))/2
|
||||
data8 0x3fb47e74e8ca5f7c //log(1/frcpa(1+44/256))/2
|
||||
data8 0x3fb4ef51f6466de4 //log(1/frcpa(1+45/256))/2
|
||||
data8 0x3fb56092e02ba516 //log(1/frcpa(1+46/256))/2
|
||||
data8 0x3fb5d23857cd74d5 //log(1/frcpa(1+47/256))/2
|
||||
data8 0x3fb6313a37335d76 //log(1/frcpa(1+48/256))/2
|
||||
data8 0x3fb6a399dabbd383 //log(1/frcpa(1+49/256))/2
|
||||
data8 0x3fb70337dd3ce41b //log(1/frcpa(1+50/256))/2
|
||||
data8 0x3fb77654128f6127 //log(1/frcpa(1+51/256))/2
|
||||
data8 0x3fb7e9d82a0b022d //log(1/frcpa(1+52/256))/2
|
||||
data8 0x3fb84a6b759f512f //log(1/frcpa(1+53/256))/2
|
||||
data8 0x3fb8ab47d5f5a310 //log(1/frcpa(1+54/256))/2
|
||||
data8 0x3fb91fe49096581b //log(1/frcpa(1+55/256))/2
|
||||
data8 0x3fb981634011aa75 //log(1/frcpa(1+56/256))/2
|
||||
data8 0x3fb9f6c407089664 //log(1/frcpa(1+57/256))/2
|
||||
data8 0x3fba58e729348f43 //log(1/frcpa(1+58/256))/2
|
||||
data8 0x3fbabb55c31693ad //log(1/frcpa(1+59/256))/2
|
||||
data8 0x3fbb1e104919efd0 //log(1/frcpa(1+60/256))/2
|
||||
data8 0x3fbb94ee93e367cb //log(1/frcpa(1+61/256))/2
|
||||
data8 0x3fbbf851c067555f //log(1/frcpa(1+62/256))/2
|
||||
data8 0x3fbc5c0254bf23a6 //log(1/frcpa(1+63/256))/2
|
||||
data8 0x3fbcc000c9db3c52 //log(1/frcpa(1+64/256))/2
|
||||
data8 0x3fbd244d99c85674 //log(1/frcpa(1+65/256))/2
|
||||
data8 0x3fbd88e93fb2f450 //log(1/frcpa(1+66/256))/2
|
||||
data8 0x3fbdedd437eaef01 //log(1/frcpa(1+67/256))/2
|
||||
data8 0x3fbe530effe71012 //log(1/frcpa(1+68/256))/2
|
||||
data8 0x3fbeb89a1648b971 //log(1/frcpa(1+69/256))/2
|
||||
data8 0x3fbf1e75fadf9bde //log(1/frcpa(1+70/256))/2
|
||||
data8 0x3fbf84a32ead7c35 //log(1/frcpa(1+71/256))/2
|
||||
data8 0x3fbfeb2233ea07cd //log(1/frcpa(1+72/256))/2
|
||||
data8 0x3fc028f9c7035c1c //log(1/frcpa(1+73/256))/2
|
||||
data8 0x3fc05c8be0d9635a //log(1/frcpa(1+74/256))/2
|
||||
data8 0x3fc085eb8f8ae797 //log(1/frcpa(1+75/256))/2
|
||||
data8 0x3fc0b9c8e32d1911 //log(1/frcpa(1+76/256))/2
|
||||
data8 0x3fc0edd060b78081 //log(1/frcpa(1+77/256))/2
|
||||
data8 0x3fc122024cf0063f //log(1/frcpa(1+78/256))/2
|
||||
data8 0x3fc14be2927aecd4 //log(1/frcpa(1+79/256))/2
|
||||
data8 0x3fc180618ef18adf //log(1/frcpa(1+80/256))/2
|
||||
data8 0x3fc1b50bbe2fc63b //log(1/frcpa(1+81/256))/2
|
||||
data8 0x3fc1df4cc7cf242d //log(1/frcpa(1+82/256))/2
|
||||
data8 0x3fc214456d0eb8d4 //log(1/frcpa(1+83/256))/2
|
||||
data8 0x3fc23ec5991eba49 //log(1/frcpa(1+84/256))/2
|
||||
data8 0x3fc2740d9f870afb //log(1/frcpa(1+85/256))/2
|
||||
data8 0x3fc29ecdabcdfa04 //log(1/frcpa(1+86/256))/2
|
||||
data8 0x3fc2d46602adccee //log(1/frcpa(1+87/256))/2
|
||||
data8 0x3fc2ff66b04ea9d4 //log(1/frcpa(1+88/256))/2
|
||||
data8 0x3fc335504b355a37 //log(1/frcpa(1+89/256))/2
|
||||
data8 0x3fc360925ec44f5d //log(1/frcpa(1+90/256))/2
|
||||
data8 0x3fc38bf1c3337e75 //log(1/frcpa(1+91/256))/2
|
||||
data8 0x3fc3c25277333184 //log(1/frcpa(1+92/256))/2
|
||||
data8 0x3fc3edf463c1683e //log(1/frcpa(1+93/256))/2
|
||||
data8 0x3fc419b423d5e8c7 //log(1/frcpa(1+94/256))/2
|
||||
data8 0x3fc44591e0539f49 //log(1/frcpa(1+95/256))/2
|
||||
data8 0x3fc47c9175b6f0ad //log(1/frcpa(1+96/256))/2
|
||||
data8 0x3fc4a8b341552b09 //log(1/frcpa(1+97/256))/2
|
||||
data8 0x3fc4d4f3908901a0 //log(1/frcpa(1+98/256))/2
|
||||
data8 0x3fc501528da1f968 //log(1/frcpa(1+99/256))/2
|
||||
data8 0x3fc52dd06347d4f6 //log(1/frcpa(1+100/256))/2
|
||||
data8 0x3fc55a6d3c7b8a8a //log(1/frcpa(1+101/256))/2
|
||||
data8 0x3fc5925d2b112a59 //log(1/frcpa(1+102/256))/2
|
||||
data8 0x3fc5bf406b543db2 //log(1/frcpa(1+103/256))/2
|
||||
data8 0x3fc5ec433d5c35ae //log(1/frcpa(1+104/256))/2
|
||||
data8 0x3fc61965cdb02c1f //log(1/frcpa(1+105/256))/2
|
||||
data8 0x3fc646a84935b2a2 //log(1/frcpa(1+106/256))/2
|
||||
data8 0x3fc6740add31de94 //log(1/frcpa(1+107/256))/2
|
||||
data8 0x3fc6a18db74a58c5 //log(1/frcpa(1+108/256))/2
|
||||
data8 0x3fc6cf31058670ec //log(1/frcpa(1+109/256))/2
|
||||
data8 0x3fc6f180e852f0ba //log(1/frcpa(1+110/256))/2
|
||||
data8 0x3fc71f5d71b894f0 //log(1/frcpa(1+111/256))/2
|
||||
data8 0x3fc74d5aefd66d5c //log(1/frcpa(1+112/256))/2
|
||||
data8 0x3fc77b79922bd37e //log(1/frcpa(1+113/256))/2
|
||||
data8 0x3fc7a9b9889f19e2 //log(1/frcpa(1+114/256))/2
|
||||
data8 0x3fc7d81b037eb6a6 //log(1/frcpa(1+115/256))/2
|
||||
data8 0x3fc8069e33827231 //log(1/frcpa(1+116/256))/2
|
||||
data8 0x3fc82996d3ef8bcb //log(1/frcpa(1+117/256))/2
|
||||
data8 0x3fc85855776dcbfb //log(1/frcpa(1+118/256))/2
|
||||
data8 0x3fc8873658327ccf //log(1/frcpa(1+119/256))/2
|
||||
data8 0x3fc8aa75973ab8cf //log(1/frcpa(1+120/256))/2
|
||||
data8 0x3fc8d992dc8824e5 //log(1/frcpa(1+121/256))/2
|
||||
data8 0x3fc908d2ea7d9512 //log(1/frcpa(1+122/256))/2
|
||||
data8 0x3fc92c59e79c0e56 //log(1/frcpa(1+123/256))/2
|
||||
data8 0x3fc95bd750ee3ed3 //log(1/frcpa(1+124/256))/2
|
||||
data8 0x3fc98b7811a3ee5b //log(1/frcpa(1+125/256))/2
|
||||
data8 0x3fc9af47f33d406c //log(1/frcpa(1+126/256))/2
|
||||
data8 0x3fc9df270c1914a8 //log(1/frcpa(1+127/256))/2
|
||||
data8 0x3fca0325ed14fda4 //log(1/frcpa(1+128/256))/2
|
||||
data8 0x3fca33440224fa79 //log(1/frcpa(1+129/256))/2
|
||||
data8 0x3fca57725e80c383 //log(1/frcpa(1+130/256))/2
|
||||
data8 0x3fca87d0165dd199 //log(1/frcpa(1+131/256))/2
|
||||
data8 0x3fcaac2e6c03f896 //log(1/frcpa(1+132/256))/2
|
||||
data8 0x3fcadccc6fdf6a81 //log(1/frcpa(1+133/256))/2
|
||||
data8 0x3fcb015b3eb1e790 //log(1/frcpa(1+134/256))/2
|
||||
data8 0x3fcb323a3a635948 //log(1/frcpa(1+135/256))/2
|
||||
data8 0x3fcb56fa04462909 //log(1/frcpa(1+136/256))/2
|
||||
data8 0x3fcb881aa659bc93 //log(1/frcpa(1+137/256))/2
|
||||
data8 0x3fcbad0bef3db165 //log(1/frcpa(1+138/256))/2
|
||||
data8 0x3fcbd21297781c2f //log(1/frcpa(1+139/256))/2
|
||||
data8 0x3fcc039236f08819 //log(1/frcpa(1+140/256))/2
|
||||
data8 0x3fcc28cb1e4d32fd //log(1/frcpa(1+141/256))/2
|
||||
data8 0x3fcc4e19b84723c2 //log(1/frcpa(1+142/256))/2
|
||||
data8 0x3fcc7ff9c74554c9 //log(1/frcpa(1+143/256))/2
|
||||
data8 0x3fcca57b64e9db05 //log(1/frcpa(1+144/256))/2
|
||||
data8 0x3fcccb130a5cebb0 //log(1/frcpa(1+145/256))/2
|
||||
data8 0x3fccf0c0d18f326f //log(1/frcpa(1+146/256))/2
|
||||
data8 0x3fcd232075b5a201 //log(1/frcpa(1+147/256))/2
|
||||
data8 0x3fcd490246defa6b //log(1/frcpa(1+148/256))/2
|
||||
data8 0x3fcd6efa918d25cd //log(1/frcpa(1+149/256))/2
|
||||
data8 0x3fcd9509707ae52f //log(1/frcpa(1+150/256))/2
|
||||
data8 0x3fcdbb2efe92c554 //log(1/frcpa(1+151/256))/2
|
||||
data8 0x3fcdee2f3445e4af //log(1/frcpa(1+152/256))/2
|
||||
data8 0x3fce148a1a2726ce //log(1/frcpa(1+153/256))/2
|
||||
data8 0x3fce3afc0a49ff40 //log(1/frcpa(1+154/256))/2
|
||||
data8 0x3fce6185206d516e //log(1/frcpa(1+155/256))/2
|
||||
data8 0x3fce882578823d52 //log(1/frcpa(1+156/256))/2
|
||||
data8 0x3fceaedd2eac990c //log(1/frcpa(1+157/256))/2
|
||||
data8 0x3fced5ac5f436be3 //log(1/frcpa(1+158/256))/2
|
||||
data8 0x3fcefc9326d16ab9 //log(1/frcpa(1+159/256))/2
|
||||
data8 0x3fcf2391a2157600 //log(1/frcpa(1+160/256))/2
|
||||
data8 0x3fcf4aa7ee03192d //log(1/frcpa(1+161/256))/2
|
||||
data8 0x3fcf71d627c30bb0 //log(1/frcpa(1+162/256))/2
|
||||
data8 0x3fcf991c6cb3b379 //log(1/frcpa(1+163/256))/2
|
||||
data8 0x3fcfc07ada69a910 //log(1/frcpa(1+164/256))/2
|
||||
data8 0x3fcfe7f18eb03d3e //log(1/frcpa(1+165/256))/2
|
||||
data8 0x3fd007c053c5002e //log(1/frcpa(1+166/256))/2
|
||||
data8 0x3fd01b942198a5a1 //log(1/frcpa(1+167/256))/2
|
||||
data8 0x3fd02f74400c64eb //log(1/frcpa(1+168/256))/2
|
||||
data8 0x3fd04360be7603ad //log(1/frcpa(1+169/256))/2
|
||||
data8 0x3fd05759ac47fe34 //log(1/frcpa(1+170/256))/2
|
||||
data8 0x3fd06b5f1911cf52 //log(1/frcpa(1+171/256))/2
|
||||
data8 0x3fd078bf0533c568 //log(1/frcpa(1+172/256))/2
|
||||
data8 0x3fd08cd9687e7b0e //log(1/frcpa(1+173/256))/2
|
||||
data8 0x3fd0a10074cf9019 //log(1/frcpa(1+174/256))/2
|
||||
data8 0x3fd0b5343a234477 //log(1/frcpa(1+175/256))/2
|
||||
data8 0x3fd0c974c89431ce //log(1/frcpa(1+176/256))/2
|
||||
data8 0x3fd0ddc2305b9886 //log(1/frcpa(1+177/256))/2
|
||||
data8 0x3fd0eb524bafc918 //log(1/frcpa(1+178/256))/2
|
||||
data8 0x3fd0ffb54213a476 //log(1/frcpa(1+179/256))/2
|
||||
data8 0x3fd114253da97d9f //log(1/frcpa(1+180/256))/2
|
||||
data8 0x3fd128a24f1d9aff //log(1/frcpa(1+181/256))/2
|
||||
data8 0x3fd1365252bf0865 //log(1/frcpa(1+182/256))/2
|
||||
data8 0x3fd14ae558b4a92d //log(1/frcpa(1+183/256))/2
|
||||
data8 0x3fd15f85a19c765b //log(1/frcpa(1+184/256))/2
|
||||
data8 0x3fd16d4d38c119fa //log(1/frcpa(1+185/256))/2
|
||||
data8 0x3fd18203c20dd133 //log(1/frcpa(1+186/256))/2
|
||||
data8 0x3fd196c7bc4b1f3b //log(1/frcpa(1+187/256))/2
|
||||
data8 0x3fd1a4a738b7a33c //log(1/frcpa(1+188/256))/2
|
||||
data8 0x3fd1b981c0c9653d //log(1/frcpa(1+189/256))/2
|
||||
data8 0x3fd1ce69e8bb106b //log(1/frcpa(1+190/256))/2
|
||||
data8 0x3fd1dc619de06944 //log(1/frcpa(1+191/256))/2
|
||||
data8 0x3fd1f160a2ad0da4 //log(1/frcpa(1+192/256))/2
|
||||
data8 0x3fd2066d7740737e //log(1/frcpa(1+193/256))/2
|
||||
data8 0x3fd2147dba47a394 //log(1/frcpa(1+194/256))/2
|
||||
data8 0x3fd229a1bc5ebac3 //log(1/frcpa(1+195/256))/2
|
||||
data8 0x3fd237c1841a502e //log(1/frcpa(1+196/256))/2
|
||||
data8 0x3fd24cfce6f80d9a //log(1/frcpa(1+197/256))/2
|
||||
data8 0x3fd25b2c55cd5762 //log(1/frcpa(1+198/256))/2
|
||||
data8 0x3fd2707f4d5f7c41 //log(1/frcpa(1+199/256))/2
|
||||
data8 0x3fd285e0842ca384 //log(1/frcpa(1+200/256))/2
|
||||
data8 0x3fd294294708b773 //log(1/frcpa(1+201/256))/2
|
||||
data8 0x3fd2a9a2670aff0c //log(1/frcpa(1+202/256))/2
|
||||
data8 0x3fd2b7fb2c8d1cc1 //log(1/frcpa(1+203/256))/2
|
||||
data8 0x3fd2c65a6395f5f5 //log(1/frcpa(1+204/256))/2
|
||||
data8 0x3fd2dbf557b0df43 //log(1/frcpa(1+205/256))/2
|
||||
data8 0x3fd2ea64c3f97655 //log(1/frcpa(1+206/256))/2
|
||||
data8 0x3fd3001823684d73 //log(1/frcpa(1+207/256))/2
|
||||
data8 0x3fd30e97e9a8b5cd //log(1/frcpa(1+208/256))/2
|
||||
data8 0x3fd32463ebdd34ea //log(1/frcpa(1+209/256))/2
|
||||
data8 0x3fd332f4314ad796 //log(1/frcpa(1+210/256))/2
|
||||
data8 0x3fd348d90e7464d0 //log(1/frcpa(1+211/256))/2
|
||||
data8 0x3fd35779f8c43d6e //log(1/frcpa(1+212/256))/2
|
||||
data8 0x3fd36621961a6a99 //log(1/frcpa(1+213/256))/2
|
||||
data8 0x3fd37c299f3c366a //log(1/frcpa(1+214/256))/2
|
||||
data8 0x3fd38ae2171976e7 //log(1/frcpa(1+215/256))/2
|
||||
data8 0x3fd399a157a603e7 //log(1/frcpa(1+216/256))/2
|
||||
data8 0x3fd3afccfe77b9d1 //log(1/frcpa(1+217/256))/2
|
||||
data8 0x3fd3be9d503533b5 //log(1/frcpa(1+218/256))/2
|
||||
data8 0x3fd3cd7480b4a8a3 //log(1/frcpa(1+219/256))/2
|
||||
data8 0x3fd3e3c43918f76c //log(1/frcpa(1+220/256))/2
|
||||
data8 0x3fd3f2acb27ed6c7 //log(1/frcpa(1+221/256))/2
|
||||
data8 0x3fd4019c2125ca93 //log(1/frcpa(1+222/256))/2
|
||||
data8 0x3fd4181061389722 //log(1/frcpa(1+223/256))/2
|
||||
data8 0x3fd42711518df545 //log(1/frcpa(1+224/256))/2
|
||||
data8 0x3fd436194e12b6bf //log(1/frcpa(1+225/256))/2
|
||||
data8 0x3fd445285d68ea69 //log(1/frcpa(1+226/256))/2
|
||||
data8 0x3fd45bcc464c893a //log(1/frcpa(1+227/256))/2
|
||||
data8 0x3fd46aed21f117fc //log(1/frcpa(1+228/256))/2
|
||||
data8 0x3fd47a1527e8a2d3 //log(1/frcpa(1+229/256))/2
|
||||
data8 0x3fd489445efffccc //log(1/frcpa(1+230/256))/2
|
||||
data8 0x3fd4a018bcb69835 //log(1/frcpa(1+231/256))/2
|
||||
data8 0x3fd4af5a0c9d65d7 //log(1/frcpa(1+232/256))/2
|
||||
data8 0x3fd4bea2a5bdbe87 //log(1/frcpa(1+233/256))/2
|
||||
data8 0x3fd4cdf28f10ac46 //log(1/frcpa(1+234/256))/2
|
||||
data8 0x3fd4dd49cf994058 //log(1/frcpa(1+235/256))/2
|
||||
data8 0x3fd4eca86e64a684 //log(1/frcpa(1+236/256))/2
|
||||
data8 0x3fd503c43cd8eb68 //log(1/frcpa(1+237/256))/2
|
||||
data8 0x3fd513356667fc57 //log(1/frcpa(1+238/256))/2
|
||||
data8 0x3fd522ae0738a3d8 //log(1/frcpa(1+239/256))/2
|
||||
data8 0x3fd5322e26867857 //log(1/frcpa(1+240/256))/2
|
||||
data8 0x3fd541b5cb979809 //log(1/frcpa(1+241/256))/2
|
||||
data8 0x3fd55144fdbcbd62 //log(1/frcpa(1+242/256))/2
|
||||
data8 0x3fd560dbc45153c7 //log(1/frcpa(1+243/256))/2
|
||||
data8 0x3fd5707a26bb8c66 //log(1/frcpa(1+244/256))/2
|
||||
data8 0x3fd587f60ed5b900 //log(1/frcpa(1+245/256))/2
|
||||
data8 0x3fd597a7977c8f31 //log(1/frcpa(1+246/256))/2
|
||||
data8 0x3fd5a760d634bb8b //log(1/frcpa(1+247/256))/2
|
||||
data8 0x3fd5b721d295f10f //log(1/frcpa(1+248/256))/2
|
||||
data8 0x3fd5c6ea94431ef9 //log(1/frcpa(1+249/256))/2
|
||||
data8 0x3fd5d6bb22ea86f6 //log(1/frcpa(1+250/256))/2
|
||||
data8 0x3fd5e6938645d390 //log(1/frcpa(1+251/256))/2
|
||||
data8 0x3fd5f673c61a2ed2 //log(1/frcpa(1+252/256))/2
|
||||
data8 0x3fd6065bea385926 //log(1/frcpa(1+253/256))/2
|
||||
data8 0x3fd6164bfa7cc06b //log(1/frcpa(1+254/256))/2
|
||||
data8 0x3fd62643fecf9743 //log(1/frcpa(1+255/256))/2
|
||||
LOCAL_OBJECT_END(atanhf_data2)
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_LIBM_ENTRY(atanhf)
|
||||
|
||||
{ .mfi
|
||||
getf.exp rArgSExpb = f8
|
||||
fclass.m p9,p0 = f8, 0x0b // is arg denormal ?
|
||||
mov rExpbMask = 0x1ffff
|
||||
}
|
||||
{ .mfi
|
||||
addl DataPtr = @ltoff(atanhf_data), gp
|
||||
fnma.s1 fOneMx = f8, f1, f1 // 1 - x
|
||||
mov rBias = 0xffff
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fclass.m p7,p0 = f8, 0xc7 // is arg NaN or +/-0 ?
|
||||
mov rNearZeroBound = 0xffeb // 2^(-20)
|
||||
}
|
||||
{ .mfi
|
||||
ld8 DataPtr = [DataPtr]
|
||||
fma.s1 fOnePx = f8, f1, f1 // 1 + x
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fnorm.s1 fNormX = f8 // Normalize x
|
||||
(p9) br.cond.spnt ATANH_UNORM // Branch if x=unorm
|
||||
}
|
||||
;;
|
||||
|
||||
ATANH_COMMON:
|
||||
// Return here if x=unorm and not denorm
|
||||
{ .mfi
|
||||
ldfpd fP3, fP2 = [DataPtr], 16
|
||||
fma.s1 fX2 = f8, f8, f0 // x^2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
(p7) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0
|
||||
(p7) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfpd fP1, fHalf = [DataPtr], 16
|
||||
frcpa.s1 fRcpM, p9 = f1, fOneMx // rcpm = frcpa(1 - x)
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
getf.exp rExpbm = fOneMx
|
||||
frcpa.s1 fRcpP, p0 = f1, fOnePx // rcpp = frcpa(1 + x)
|
||||
// biased exponent
|
||||
and rArgExpb = rArgSExpb, rExpbMask
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
getf.exp rExpbp = fOnePx
|
||||
// is |x| < 2^(-20) ?
|
||||
cmp.gt p8,p0 = rNearZeroBound, rArgExpb
|
||||
cmp.ge p6,p0 = rArgExpb, rBias // is |x| >= 1 ?
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmb
|
||||
getf.sig rSigm = fOneMx
|
||||
nop.m 0
|
||||
(p6) br.cond.spnt atanhf_ge_one
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
getf.sig rSigp = fOnePx
|
||||
(p8) fma.s.s0 f8 = fX2, f8, f8 // x + x^3
|
||||
(p8) br.ret.spnt b0 // Exit for MAX_DENORM_ABS < |x| < 2^-20
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfd fLog2 = [DataPtr], 16
|
||||
fms.s1 fRm = fRcpM, fOneMx, f1 // rm = rcpm * (1 - x) - 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmf
|
||||
// (1 - x) is always positive here and we need not mask sign bit
|
||||
sub rNm = rExpbm, rBias
|
||||
// (1 + x) is always positive here and we need not mask sign bit
|
||||
sub rNp = rExpbp, rBias
|
||||
fms.s1 fRp = fRcpP, fOnePx, f1 // rp = rcpp * (1 + x) - 1
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
setf.sig fN4CvtM = rNm
|
||||
setf.sig fN4CvtP = rNp
|
||||
extr.u rIndm = rSigm,55,8 // Extract 8 bits
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
shladd RcpTablePtrM = rIndm, 3, DataPtr
|
||||
nop.m 0
|
||||
extr.u rIndp = rSigp,55,8 // Extract 8 bits
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
ldfd fLogTm = [RcpTablePtrM]
|
||||
shladd RcpTablePtrP = rIndp, 3, DataPtr
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfd fLogTp = [RcpTablePtrP]
|
||||
fma.s1 fRm2 = fRm, fRm, f0 // rm^2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP32m = fP3, fRm, fP2 // P3*rm + P2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fRp2 = fRp, fRp, f0 // rp^2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP10m = fP1, fRm, fHalf // P1*rm + 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP32p = fP3, fRp, fP2 // P3*rp + P2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP10p = fP1, fRp, fHalf // P1*rp + 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcvt.xf fNm = fN4CvtM
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcvt.xf fNp = fN4CvtP
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (P3*rm + P2)*rm^2 + (P1*rm + 1)
|
||||
fma.s1 fP32m = fP32m, fRm2, fP10m
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (P3*rp + P2)*rp^2 + (P1*rp + 1)
|
||||
fma.s1 fP32p = fP32p, fRp2, fP10p
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Nm*ln(2)/2 + Tm/2
|
||||
fma.s1 fLogTm = fNm, fLog2, fLogTm
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Np*ln(2)/2 + Tp/2
|
||||
fma.s1 fLogTp = fNp, fLog2, fLogTp
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// ((P3*rm + P2)*rm^2 + (P3*rm + 1))*0.5*rm + (Nm*ln(2)/2 + Tm/2)
|
||||
fma.d.s1 fP32m = fP32m, fRm, fLogTm
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// ((P3*rp + P2)*rp^2 + (P3*rp + 1))*0.5*rp + (Np*ln(2)/2 + Tp/2)
|
||||
fma.d.s1 fP32p = fP32p, fRp, fLogTp
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// atanhf(x) = 0.5 * (log(1 + x) - log(1 - x))
|
||||
fnma.s.s0 f8 = fP32m, f1, fP32p
|
||||
br.ret.sptk b0 // Exit for 2^(-20) <= |x| < 1.0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// ATANH_UNORM: side path taken when the input is unnormalized.
// Re-reads the sign/exponent from the normalized copy (fNormX), then either
// rejoins ATANH_COMMON (x not denormal) or returns x -/+ x^2 directly.
ATANH_UNORM:
|
||||
// Here if x=unorm
|
||||
{ .mfi
|
||||
getf.exp rArgSExpb = fNormX // Recompute if x unorm
|
||||
// p9 is the complement predicate: set when fNormX is NOT in class 0x0b
fclass.m p0,p9 = fNormX, 0x0b // Test x denorm
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// Compare in status field 0 is done for its flag side effect; p10/p11 are
// not read anywhere in this section.
fcmp.lt.s0 p10,p11 = f8, f0 // Set denormal flag
|
||||
(p9) br.cond.sptk ATANH_COMMON // Continue if x unorm and not denorm
|
||||
}
|
||||
;;
|
||||
|
||||
// NOTE(review): p6/p7 are not written in this section; presumably they hold
// the sign test of x computed earlier in atanhf -- confirm against the entry code.
.pred.rel "mutex",p6,p7
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// fnma computes -(f8*f8) + f8
(p6) fnma.s.s0 f8 = f8,f8,f8 // Result x-x^2 if x=-denorm
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
(p7) fma.s.s0 f8 = f8,f8,f8 // Result x+x^2 if x=+denorm
|
||||
br.ret.spnt b0 // Exit if denorm
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if |x| >= 1.0
// Domain/pole error path of atanhf: allocates a register frame, keeps the
// original argument in f10, builds the result in f8 (QNaN for |x| > 1,
// signed infinity for |x| = 1), selects the error tag and branches to
// __libm_error_region.
|
||||
atanhf_ge_one:
|
||||
{ .mfi
|
||||
alloc r32 = ar.pfs,1,3,4,0 // frame: 1 input, 3 locals, 4 outputs, no rotating regs
|
||||
// sign taken from f0 (+0.0), exponent/significand from f8
fmerge.s fArgAbs = f0, f8 // Form |x|
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fmerge.s f10 = f8, f8 // Save input for error call
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// p6 = (|x| == 1.0), p7 = complement (|x| > 1.0 on this path)
fcmp.eq.s1 p6,p7 = fArgAbs, f1 // Test for |x| = 1.0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Set error tag and result, and raise invalid flag if |x| > 1.0
|
||||
{ .mfi
|
||||
(p7) mov atanh_GR_tag = 133
|
||||
// reciprocal approximation of 0/0
(p7) frcpa.s0 f8, p0 = f0, f0 // Get QNaN, and raise invalid
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Set error tag and result, and raise Z flag if |x| = 1.0
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// reciprocal approximation of 1/0
(p6) frcpa.s0 fRm, p0 = f1, f0 // Get inf, and raise Z flag
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
(p6) mov atanh_GR_tag = 134
|
||||
// sign of x (f8) combined with the infinity held in fRm
(p6) fmerge.s f8 = f8, fRm // result is +-inf
|
||||
br.cond.sptk __libm_error_region // Exit if |x| >= 1.0
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_LIBM_END(atanhf)
|
||||
libm_alias_float_other (atanh, atanh)
|
||||
|
||||
|
||||
// __libm_error_region (atanhf copy)
// Local error exit reached by a branch from atanhf_ge_one, which has already
// set the error tag, the saved input (f10) and the tentative result (f8).
// Builds a 64-byte stack frame, stores three single-precision values in it,
// calls __libm_error_support, reloads the result into f8 and returns through
// the saved b0.
// Frame layout relative to the new sp, as established by the code below:
//   sp+16 : parameter 1 (f10, original argument)
//   sp+32 : parameter 2 (f1)
//   sp+48 : parameter 3 / result (f8)
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
|
||||
{ .mfi
|
||||
// old sp - 32 is the same address as new sp + 32 once the frame is created
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
// post-increment by 16 leaves GR_Parameter_Y pointing at sp+48
stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
|
||||
.body
|
||||
{ .mib
|
||||
stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
|
||||
// Parameter 3 address
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
|
||||
// step GR_Parameter_Y back from sp+48 to sp+32 (parameter 2 slot)
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
// result address is recomputed from sp after the call instead of reusing
// the value set before it
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
File diff suppressed because it is too large
Load Diff
@ -1,866 +0,0 @@
|
||||
.file "cosh.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/02/00 Initial version
|
||||
// 04/04/00 Unwind support added
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 05/07/01 Reworked to improve speed of all paths
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 11/15/02 Improved speed with new algorithm
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
|
||||
// API
|
||||
//==============================================================
|
||||
// double cosh(double)
|
||||
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Case 1: 0 < |x| < 0.25
|
||||
// Evaluate cosh(x) by a 12th order polynomial
|
||||
// Care is taken for the order of multiplication; and A2 is not exactly 1/4!,
|
||||
// A3 is not exactly 1/6!, etc.
|
||||
// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8 + A5*x^10 + A6*x^12)
|
||||
//
|
||||
// Case 2: 0.25 < |x| < 710.47586
|
||||
// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2.
|
||||
// The algorithm for exp is described as below. There are a number of
|
||||
// economies from evaluating both exp(x) and exp(-x). Although we
|
||||
// are evaluating both quantities, only where the quantities diverge do we
|
||||
// duplicate the computations. The basic algorithm for exp(x) is described
|
||||
// below.
|
||||
//
|
||||
// Take the input x. w is "how many log2/128 in x?"
|
||||
// w = x * 128/log2
|
||||
// n = int(w)
|
||||
// x = n log2/128 + r + delta
|
||||
|
||||
// n = 128M + index_1 + 2^4 index_2
|
||||
// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
|
||||
|
||||
// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
|
||||
// Construct 2^M
|
||||
// Get 2^(index_1/128) from table_1;
|
||||
// Get 2^(index_2/8) from table_2;
|
||||
// Calculate exp(r) by 5th order polynomial
|
||||
// r = x - n (log2/128)_high
|
||||
// delta = - n (log2/128)_low
|
||||
// Calculate exp(delta) as 1 + delta
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// cosh(+0) = 1.0
|
||||
// cosh(-0) = 1.0
|
||||
|
||||
// cosh(+qnan) = +qnan
|
||||
// cosh(-qnan) = -qnan
|
||||
// cosh(+snan) = +qnan
|
||||
// cosh(-snan) = -qnan
|
||||
|
||||
// cosh(-inf) = +inf
|
||||
// cosh(+inf) = +inf
|
||||
|
||||
// Overflow and Underflow
|
||||
//=======================
|
||||
// cosh(x) = largest double normal when
|
||||
// x = 710.47586 = 0x408633ce8fb9f87d
|
||||
//
|
||||
// There is no underflow.
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// Floating Point registers used:
|
||||
// f8, input, output
|
||||
// f6 -> f15, f32 -> f61
|
||||
|
||||
// General registers used:
|
||||
// r14 -> r40
|
||||
|
||||
// Predicate registers used:
|
||||
// p6 -> p15
|
||||
|
||||
// Assembly macros
|
||||
//==============================================================
|
||||
|
||||
// Symbolic register names used by cosh.
// Several names are assigned the same physical register (e.g. rRshf and
// rN_neg are both r14); such names are aliases and must not be live at the
// same time.

// General registers r14-r31
rRshf = r14
|
||||
rN_neg = r14
|
||||
rAD_TB1 = r15
|
||||
rAD_TB2 = r16
|
||||
rAD_P = r17
|
||||
rN = r18
|
||||
rIndex_1 = r19
|
||||
rIndex_2_16 = r20
|
||||
rM = r21
|
||||
rBiased_M = r21
|
||||
rSig_inv_ln2 = r22
|
||||
rIndex_1_neg = r22
|
||||
rExp_bias = r23
|
||||
rExp_bias_minus_1 = r23
|
||||
rExp_mask = r24
|
||||
rTmp = r24
|
||||
rGt_ln = r24
|
||||
rIndex_2_16_neg = r24
|
||||
rM_neg = r25
|
||||
rBiased_M_neg = r25
|
||||
rRshf_2to56 = r26
|
||||
rAD_T1_neg = r26
|
||||
rExp_2tom56 = r28
|
||||
rAD_T2_neg = r28
|
||||
rAD_T1 = r29
|
||||
rAD_T2 = r30
|
||||
rSignexp_x = r31
|
||||
rExp_x = r31
|
||||
|
||||
// Registers r33-r40, used on the error path; COSH_CERTAIN_OVERFLOW allocates
// the frame with "alloc r32=ar.pfs,1,4,4,0" before they are written.
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
// GR_SAVE_SP is defined here but not referenced by the cosh code in this file
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
// Floating-point values handed to the error path
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
// Constants loaded or formed at entry
fRSHF_2TO56 = f6
|
||||
fINV_LN2_2TO63 = f7
|
||||
fW_2TO56_RSH = f9
|
||||
f2TOM56 = f11
|
||||
fP5 = f12
|
||||
fP4 = f13
|
||||
fP3 = f14
|
||||
fP2 = f15
|
||||
|
||||
fLn2_by_128_hi = f33
|
||||
fLn2_by_128_lo = f34
|
||||
|
||||
fRSHF = f35
|
||||
fNfloat = f36
|
||||
fNormX = f37
|
||||
fR = f38
|
||||
fF = f39
|
||||
|
||||
fRsq = f40
|
||||
f2M = f41
|
||||
fS1 = f42
|
||||
fT1 = f42
|
||||
fS2 = f43
|
||||
fT2 = f43
|
||||
fS = f43
|
||||
fWre_urm_f8 = f44
|
||||
fAbsX = f44
|
||||
|
||||
fMIN_DBL_OFLOW_ARG = f45
|
||||
fMAX_DBL_NORM_ARG = f46
|
||||
fXsq = f47
|
||||
fX4 = f48
|
||||
fGt_pln = f49
|
||||
fTmp = f49
|
||||
|
||||
fP54 = f50
|
||||
fP5432 = f50
|
||||
fP32 = f51
|
||||
fP = f52
|
||||
fP54_neg = f53
|
||||
fP5432_neg = f53
|
||||
fP32_neg = f54
|
||||
fP_neg = f55
|
||||
fF_neg = f56
|
||||
|
||||
f2M_neg = f57
|
||||
fS1_neg = f58
|
||||
fT1_neg = f58
|
||||
fS2_neg = f59
|
||||
fT2_neg = f59
|
||||
fS_neg = f59
|
||||
fExp = f60
|
||||
fExp_neg = f61
|
||||
|
||||
// Small-argument polynomial names; these reuse f50-f55, which the main path
// uses for fP54 ... fP_neg.
fA6 = f50
|
||||
fA65 = f50
|
||||
fA6543 = f50
|
||||
fA654321 = f50
|
||||
fA5 = f51
|
||||
fA4 = f52
|
||||
fA43 = f52
|
||||
fA3 = f53
|
||||
fA2 = f54
|
||||
fA21 = f54
|
||||
fA1 = f55
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
.align 16
|
||||
|
||||
// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
|
||||
|
||||
// double-extended 1/ln(2)
|
||||
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
|
||||
// 3fff b8aa 3b29 5c17 f0bc
|
||||
// For speed the significand will be loaded directly with a movl and setf.sig
|
||||
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
|
||||
// computations need to scale appropriately.
|
||||
// The constant 128/ln(2) is needed for the computation of w. This is also
|
||||
// obtained by scaling the computations.
|
||||
//
|
||||
// Two shifting constants are loaded directly with movl and setf.d.
|
||||
// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7)
|
||||
// This constant is added to x*1/ln2 to shift the integer part of
|
||||
// x*128/ln2 into the rightmost bits of the significand.
|
||||
// The result of this fma is fW_2TO56_RSH.
|
||||
// 2. fRSHF = 1.1000..00 * 2^(63)
|
||||
// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
|
||||
// the integer part of w, n, as a floating-point number.
|
||||
// The result of this fms is fNfloat.
|
||||
|
||||
|
||||
// exp_table_1: 48 bytes of constants followed by Table 1.
// cosh reads the first two doubles with ldfpd and the two ln2/128 parts with
// ldfe, each with a 16-byte post-increment, so its table pointer ends up at
// the first Table 1 entry. The later tables are addressed as fixed offsets
// from that point, which is why the order and sizes must not change.
LOCAL_OBJECT_START(exp_table_1)
|
||||
data8 0x408633ce8fb9f87e // smallest dbl overflow arg
|
||||
data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result
|
||||
data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
|
||||
data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
|
||||
//
|
||||
// Table 1 is 2^(index_1/128) where
|
||||
// index_1 goes from 0 to 15
|
||||
//
|
||||
// 16 entries of 16 bytes each (significand, then sign/exponent word)
data8 0x8000000000000000 , 0x00003FFF
|
||||
data8 0x80B1ED4FD999AB6C , 0x00003FFF
|
||||
data8 0x8164D1F3BC030773 , 0x00003FFF
|
||||
data8 0x8218AF4373FC25EC , 0x00003FFF
|
||||
data8 0x82CD8698AC2BA1D7 , 0x00003FFF
|
||||
data8 0x8383594EEFB6EE37 , 0x00003FFF
|
||||
data8 0x843A28C3ACDE4046 , 0x00003FFF
|
||||
data8 0x84F1F656379C1A29 , 0x00003FFF
|
||||
data8 0x85AAC367CC487B15 , 0x00003FFF
|
||||
data8 0x8664915B923FBA04 , 0x00003FFF
|
||||
data8 0x871F61969E8D1010 , 0x00003FFF
|
||||
data8 0x87DB357FF698D792 , 0x00003FFF
|
||||
data8 0x88980E8092DA8527 , 0x00003FFF
|
||||
data8 0x8955EE03618E5FDD , 0x00003FFF
|
||||
data8 0x8A14D575496EFD9A , 0x00003FFF
|
||||
data8 0x8AD4C6452C728924 , 0x00003FFF
|
||||
LOCAL_OBJECT_END(exp_table_1)
|
||||
|
||||
// Table 2 is 2^(index_2/8) where
|
||||
// index_2 goes from 0 to 7
|
||||
// exp_table_2: 8 entries of 16 bytes each.
// cosh addresses this table as (Table 1 base + 0x100) and indexes it with
// index_2*16, so it must directly follow the 16 entries of Table 1.
LOCAL_OBJECT_START(exp_table_2)
|
||||
data8 0x8000000000000000 , 0x00003FFF
|
||||
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
|
||||
data8 0x9837F0518DB8A96F , 0x00003FFF
|
||||
data8 0xA5FED6A9B15138EA , 0x00003FFF
|
||||
data8 0xB504F333F9DE6484 , 0x00003FFF
|
||||
data8 0xC5672A115506DADD , 0x00003FFF
|
||||
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
|
||||
data8 0xEAC0C6E7DD24392F , 0x00003FFF
|
||||
LOCAL_OBJECT_END(exp_table_2)
|
||||
|
||||
// exp_p_table: four 8-byte coefficients P5..P2 of the exp(r) polynomial.
// cosh addresses it as (Table 1 base + 0x180) and reads it with two ldfpd
// loads: (P5, P4) and then (P3, P2).
LOCAL_OBJECT_START(exp_p_table)
|
||||
data8 0x3f8111116da21757 //P5
|
||||
data8 0x3fa55555d787761c //P4
|
||||
data8 0x3fc5555555555414 //P3
|
||||
data8 0x3fdffffffffffd6a //P2
|
||||
LOCAL_OBJECT_END(exp_p_table)
|
||||
|
||||
// cosh_p_table: six 16-byte coefficients of the small-argument polynomial.
// COSH_SMALL walks it with two pointers, one starting at (Table 1 base +
// 0x1a0) for A6, A4, A2 and one at (Table 1 base + 0x1d0) for A5, A3, A1,
// which is why the entries are stored in this interleaved order.
LOCAL_OBJECT_START(cosh_p_table)
|
||||
data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // A6
|
||||
data8 0xD00D00D1021D7370, 0x00003FEF // A4
|
||||
data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // A2
|
||||
data8 0x93F27740C0C2F1CC, 0x00003FE9 // A5
|
||||
data8 0xB60B60B60B4FE884, 0x00003FF5 // A3
|
||||
data8 0x8000000000000000, 0x00003FFE // A1
|
||||
LOCAL_OBJECT_END(cosh_p_table)
|
||||
|
||||
|
||||
.section .text
|
||||
// double cosh(double): argument and result are both in f8.
// Entry section: starts loading constants, normalizes x into fNormX and
// classifies the input:
//   p6  - x is unnormalized (branches to COSH_UNORM, which rejoins COSH_COMMON)
//   p8  - x is zero          (handled at the top of COSH_COMMON)
//   p10 - x is inf/NaN/NaT   (handled in COSH_COMMON)
GLOBAL_IEEE754_ENTRY(cosh)
|
||||
|
||||
{ .mlx
|
||||
getf.exp rSignexp_x = f8 // Must recompute if x unorm
|
||||
movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
|
||||
}
|
||||
{ .mlx
|
||||
// address of the linkage-table slot for exp_table_1; the ld8 below fetches
// the table address from it
addl rAD_TB1 = @ltoff(exp_table_1), gp
|
||||
movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ld8 rAD_TB1 = [rAD_TB1]
|
||||
fclass.m p6,p0 = f8,0x0b // Test for x=unorm
|
||||
mov rExp_mask = 0x1ffff
|
||||
}
|
||||
{ .mfi
|
||||
mov rExp_bias = 0xffff
|
||||
fnorm.s1 fNormX = f8
|
||||
mov rExp_2tom56 = 0xffff-56
|
||||
}
|
||||
;;
|
||||
|
||||
// Form two constants we need
|
||||
// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
|
||||
// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
|
||||
|
||||
{ .mfi
|
||||
setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
|
||||
fclass.m p8,p0 = f8,0x07 // Test for x=0
|
||||
nop.i 999
|
||||
}
|
||||
{ .mlx
|
||||
setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
|
||||
movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// loads the two overflow thresholds and advances the table pointer by 16
ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16
|
||||
fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm
|
||||
}
|
||||
;;
|
||||
|
||||
// COSH_COMMON: main path of cosh, entered by fall-through from the entry
// section or by a branch from COSH_UNORM.
// In order, it
//   - returns 1.0 for x = 0 (p8) and x*x for inf/NaN/NaT (p10),
//   - branches to COSH_SMALL when the unbiased exponent of x is below -2,
//   - branches to COSH_CERTAIN_OVERFLOW when |x| >= fMIN_DBL_OFLOW_ARG (p15),
//   - otherwise evaluates the exp(x) and exp(-x) reconstructions side by side
//     (names ending in _neg belong to exp(-x)) and returns their sum, or
//     branches to COSH_POSSIBLE_OVERFLOW when p14 is set.
COSH_COMMON:
|
||||
{ .mfi
|
||||
ldfe fLn2_by_128_hi = [rAD_TB1],16
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
|
||||
(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfe fLn2_by_128_lo = [rAD_TB1],16
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
|
||||
(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=inf, nan, NaT
|
||||
(p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT
|
||||
}
|
||||
;;
|
||||
|
||||
// After that last load rAD_TB1 points to the beginning of table 1
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
|
||||
sub rExp_x = rExp_x, rExp_bias // True exponent of x
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fmerge.s fAbsX = f0, fNormX // Form |x|
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
|
||||
fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
|
||||
(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2
|
||||
}
|
||||
;;
|
||||
|
||||
// W = X * Inv_log2_by_128
|
||||
// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
|
||||
// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
|
||||
|
||||
{ .mfi
|
||||
// 0x180 and 0x100 are the offsets of exp_p_table and exp_table_2 from the
// start of Table 1 (16 entries * 16 bytes, then 8 entries * 16 bytes)
add rAD_P = 0x180, rAD_TB1
|
||||
fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
|
||||
add rAD_TB2 = 0x100, rAD_TB1
|
||||
}
|
||||
;;
|
||||
|
||||
// Divide arguments into the following categories:
|
||||
// Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG
|
||||
// Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG
|
||||
// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf
|
||||
//
|
||||
// If the input is really a double arg, then there will never be
|
||||
// "Possible Overflow" arguments.
|
||||
//
|
||||
|
||||
{ .mfi
|
||||
ldfpd fP5, fP4 = [rAD_P] ,16
|
||||
fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Nfloat = round_int(W)
|
||||
// The significand of fW_2TO56_RSH contains the rounded integer part of W,
|
||||
// as a twos complement number in the lower bits (that is, it may be negative).
|
||||
// That twos complement number (called N) is put into rN.
|
||||
|
||||
// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
|
||||
// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
|
||||
// Thus, fNfloat contains the floating point version of N
|
||||
|
||||
{ .mfi
|
||||
ldfpd fP3, fP2 = [rAD_P]
|
||||
// narrows p14 from "not certain overflow" to "above the largest safe argument"
(p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
|
||||
(p15) br.cond.spnt COSH_CERTAIN_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
getf.sig rN = fW_2TO56_RSH
|
||||
nop.f 0
|
||||
// bias - 1: the scale factors built from it below are 2^(M-1) rather than 2^M
// (presumably this supplies the division by 2 in (exp(x)+exp(-x))/2 -- confirm)
mov rExp_bias_minus_1 = 0xfffe
|
||||
}
|
||||
;;
|
||||
|
||||
// rIndex_1 has index_1
|
||||
// rIndex_2_16 has index_2 * 16
|
||||
// rBiased_M has M
|
||||
|
||||
// rM has true M
|
||||
// r = x - Nfloat * ln2_by_128_hi
|
||||
// f = 1 - Nfloat * ln2_by_128_lo
|
||||
{ .mfi
|
||||
and rIndex_1 = 0x0f, rN
|
||||
fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
|
||||
shr rM = rN, 0x7
|
||||
}
|
||||
{ .mfi
|
||||
and rIndex_2_16 = 0x70, rN
|
||||
fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
|
||||
// -N, from which the indices and exponent for exp(-x) are derived
sub rN_neg = r0, rN
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
and rIndex_1_neg = 0x0f, rN_neg
|
||||
add rBiased_M = rExp_bias_minus_1, rM
|
||||
shr rM_neg = rN_neg, 0x7
|
||||
}
|
||||
{ .mmi
|
||||
and rIndex_2_16_neg = 0x70, rN_neg
|
||||
add rAD_T2 = rAD_TB2, rIndex_2_16
|
||||
// index_1 * 16 + Table 1 base
shladd rAD_T1 = rIndex_1, 4, rAD_TB1
|
||||
}
|
||||
;;
|
||||
|
||||
// rAD_T1 has address of T1
|
||||
// rAD_T2 has address of T2
|
||||
|
||||
{ .mmi
|
||||
setf.exp f2M = rBiased_M
|
||||
ldfe fT2 = [rAD_T2]
|
||||
nop.i 0
|
||||
}
|
||||
{ .mmi
|
||||
add rBiased_M_neg = rExp_bias_minus_1, rM_neg
|
||||
add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg
|
||||
shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1
|
||||
}
|
||||
;;
|
||||
|
||||
// Create Scale = 2^M
|
||||
// Load T1 and T2
|
||||
{ .mmi
|
||||
ldfe fT1 = [rAD_T1]
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mmf
|
||||
setf.exp f2M_neg = rBiased_M_neg
|
||||
ldfe fT2_neg = [rAD_T2_neg]
|
||||
// f for exp(-x): 1 + Nfloat * ln2_by_128_lo (sign flipped relative to fF)
fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fRsq = fR, fR, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
ldfe fT1_neg = [rAD_T1_neg]
|
||||
fma.s1 fP54 = fR, fP5, fP4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP32 = fR, fP3, fP2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// the _neg polynomial terms use -r in place of r (fnma negates the product)
fnma.s1 fP54_neg = fR, fP5, fP4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnma.s1 fP32_neg = fR, fP3, fP2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP5432 = fRsq, fP54, fP32
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS2 = fF,fT2,f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS1 = f2M,fT1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS1_neg = f2M_neg,fT1_neg,f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS2_neg = fF_neg,fT2_neg,f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// P = r + r^2 * (P2 + P3*r + r^2*(P4 + P5*r))
fma.s1 fP = fRsq, fP5432, fR
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS = fS1,fS2,f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// same polynomial for exp(-x): the linear term is subtracted (fms)
fms.s1 fP_neg = fRsq, fP5432_neg, fR
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS_neg = fS1_neg,fS2_neg,f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
|
||||
(p14) br.cond.spnt COSH_POSSIBLE_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// S + S*P for each half
fma.s1 fExp = fS, fP, fS
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final sum of the two halves, rounded to double in status field 0
fma.d.s0 f8 = fExp, f1, fExp_neg
|
||||
br.ret.sptk b0 // Normal path exit
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if 0 < |x| < 0.25
// Evaluates cosh(x) = 1 + x^2*(A1 + A2*x^2 + A3*x^4 + A4*x^6 + A5*x^8 + A6*x^10)
// using fXsq = x*x, which COSH_COMMON computed before branching here.
// rAD_TB1 points at Table 1; 0x1a0 and 0x1d0 are the offsets of the A6 and A5
// entries of cosh_p_table from that point.
|
||||
COSH_SMALL:
|
||||
// NOTE(review): only two instructions are listed in this .mmf bundle; confirm
// the assembler fills the remaining F slot.
{ .mmf
|
||||
add rAD_T1 = 0x1a0, rAD_TB1
|
||||
add rAD_T2 = 0x1d0, rAD_TB1
|
||||
}
|
||||
;;
|
||||
|
||||
// Each load post-increments its pointer by 16: rAD_T1 walks A6, A4, A2 and
// rAD_T2 walks A5, A3, A1.
{ .mmf
|
||||
ldfe fA6 = [rAD_T1],16
|
||||
ldfe fA5 = [rAD_T2],16
|
||||
nop.f 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
ldfe fA4 = [rAD_T1],16
|
||||
ldfe fA3 = [rAD_T2],16
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
ldfe fA2 = [rAD_T1],16
|
||||
ldfe fA1 = [rAD_T2],16
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fX4 = fXsq, fXsq, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// A6*x^2 + A5
fma.s1 fA65 = fXsq, fA6, fA5
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// A4*x^2 + A3
fma.s1 fA43 = fXsq, fA4, fA3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// A2*x^2 + A1
fma.s1 fA21 = fXsq, fA2, fA1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA6543 = fX4, fA65, fA43
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fA654321 = fX4, fA6543, fA21
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Dummy multiply to generate inexact
|
||||
// (fA6 names the same register as fA654321, so the operand is the polynomial
// value by now; the product itself is not used afterwards)
{ .mfi
|
||||
nop.m 0
|
||||
fmpy.s0 fTmp = fA6, fA6
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// result = polynomial * x^2 + 1
fma.d.s0 f8 = fA654321, fXsq, f1
|
||||
br.ret.sptk b0 // Exit if 0 < |x| < 0.25
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// COSH_POSSIBLE_OVERFLOW: reached from COSH_COMMON when p14 is set.
// Uses fS and fP, which COSH_COMMON has already computed.
// NOTE(review): both the trial result and the returned result below are
// fS*fP + fS only, i.e. the exp(x) half; the normal path returns
// fExp + fExp_neg. For a negative argument in this range fS would be the
// small half -- confirm whether that case is intended to be unreachable.
COSH_POSSIBLE_OVERFLOW:
|
||||
|
||||
// Here if fMAX_DBL_NORM_ARG < |x| < fMIN_DBL_OFLOW_ARG
|
||||
// This cannot happen if input is a double, only if input higher precision.
|
||||
// Overflow is a possibility, not a certainty.
|
||||
|
||||
// Recompute result using status field 2 with user's rounding mode,
|
||||
// and wre set. If result is larger than largest double, then we have
|
||||
// overflow
|
||||
|
||||
{ .mfi
|
||||
// 0x103ff = exponent bias 0xffff + 0x400
mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
|
||||
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
|
||||
fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// same product as the trial above, now rounded in status field 0
fma.d.s0 f8 = fS, fP, fS
|
||||
br.ret.sptk b0 // Exit if really no overflow
|
||||
}
|
||||
;;
|
||||
|
||||
// COSH_CERTAIN_OVERFLOW: reached from COSH_COMMON (p15) or from
// COSH_POSSIBLE_OVERFLOW. Produces an overflowed result in f8, keeps the
// original argument in FR_X, sets the error tag to 64 and branches to
// __libm_error_region.
COSH_CERTAIN_OVERFLOW:
|
||||
{ .mmi
|
||||
// three-operand sub with trailing 1 computes rExp_mask - r0 - 1 = 0x1fffe
sub rTmp = rExp_mask, r0, 1
|
||||
// stop inside the bundle: the setf.exp below reads rTmp written just above
;;
|
||||
setf.exp fTmp = rTmp
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// frame: 1 input, 4 locals, 4 outputs -- covers r32..r40 used by the error path
alloc r32=ar.pfs,1,4,4,0
|
||||
// copy of the argument for the error handler, taken before f8 is overwritten
fmerge.s FR_X = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 64
|
||||
fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if x unorm
// Re-reads the sign/exponent from the normalized copy fNormX (the value read
// at entry came from the unnormalized f8), then rejoins the main path.
|
||||
COSH_UNORM:
|
||||
{ .mfb
|
||||
getf.exp rSignexp_x = fNormX // Must recompute if x unorm
|
||||
// compare in status field 0 is done for its flag side effect; p6 is not
// tested again before it is rewritten
fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
|
||||
br.cond.sptk COSH_COMMON
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_IEEE754_END(cosh)
|
||||
libm_alias_double_other (__cosh, cosh)
|
||||
|
||||
|
||||
// __libm_error_region (cosh copy)
// Local error exit reached by a branch from COSH_CERTAIN_OVERFLOW, which has
// already set GR_Parameter_TAG, FR_X (original argument) and FR_RESULT.
// Builds a 64-byte stack frame, stores three doubles in it, calls
// __libm_error_support, reloads the result into f8 and returns through the
// saved b0.
// Frame layout relative to the new sp, as established by the code below:
//   sp+16 : parameter 1 (FR_X)
//   sp+32 : parameter 2 (FR_Y)
//   sp+48 : parameter 3 / result (FR_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
// old sp - 32 is the same address as new sp + 32 once the frame is created
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// post-increment by 16 leaves GR_Parameter_Y pointing at sp+48
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
// step GR_Parameter_Y back from sp+48 to sp+32 (parameter 2 slot)
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
// Separate bundle after the call: the file history notes it was added to
// re-set GR_Parameter_RESULT, which the call may have overwritten.
{ .mmi
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,711 +0,0 @@
|
||||
.file "coshf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//*********************************************************************
|
||||
// 02/02/00 Initial version
|
||||
// 02/16/00 The error tag for coshf overflow changed to 65 (from 64).
|
||||
// 04/04/00 Unwind support added
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 05/07/01 Reworked to improve speed of all paths
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 11/15/02 Improved algorithm based on expf
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
//
|
||||
// API
|
||||
//*********************************************************************
|
||||
// float coshf(float)
|
||||
//
|
||||
// Overview of operation
|
||||
//*********************************************************************
|
||||
// Case 1: 0 < |x| < 0.25
|
||||
// Evaluate cosh(x) by an 8th order polynomial
|
||||
// Care is taken for the order of multiplication; and A2 is not exactly 1/4!,
|
||||
// A3 is not exactly 1/6!, etc.
|
||||
// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8)
|
||||
//
|
||||
// Case 2: 0.25 < |x| < 89.41598
|
||||
// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2.
|
||||
// The algorithm for exp is described as below. There are a number of
|
||||
// economies from evaluating both exp(x) and exp(-x). Although we
|
||||
// are evaluating both quantities, only where the quantities diverge do we
|
||||
// duplicate the computations. The basic algorithm for exp(x) is described
|
||||
// below.
|
||||
//
|
||||
// Take the input x. w is "how many log2/64 in x?"
|
||||
// w = x * 64/log2
|
||||
// NJ = int(w)
|
||||
// x = NJ*log2/64 + R
|
||||
|
||||
// NJ = 64*n + j
|
||||
// x = n*log2 + (log2/64)*j + R
|
||||
//
|
||||
// So, exp(x) = 2^n * 2^(j/64)* exp(R)
|
||||
//
|
||||
// T = 2^n * 2^(j/64)
|
||||
// Construct 2^n
|
||||
// Get 2^(j/64) table
|
||||
// actually all the entries of 2^(j/64) table are stored in DP and
|
||||
// with exponent bits set to 0 -> multiplication on 2^n can be
|
||||
// performed by doing logical "or" operation with bits presenting 2^n
|
||||
|
||||
// exp(R) = 1 + (exp(R) - 1)
|
||||
// P = exp(R) - 1 approximated by Taylor series of 3rd degree
|
||||
// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
|
||||
//
|
||||
|
||||
// The final result is reconstructed as follows
|
||||
// exp(x) = T + T*P
|
||||
|
||||
// Special values
|
||||
//*********************************************************************
|
||||
// coshf(+0) = 1.0
|
||||
// coshf(-0) = 1.0
|
||||
|
||||
// coshf(+qnan) = +qnan
|
||||
// coshf(-qnan) = -qnan
|
||||
// coshf(+snan) = +qnan
|
||||
// coshf(-snan) = -qnan
|
||||
|
||||
// coshf(-inf) = +inf
|
||||
// coshf(+inf) = +inf
|
||||
|
||||
// Overflow and Underflow
|
||||
//*********************************************************************
|
||||
// coshf(x) = largest single normal when
|
||||
// x = 89.41598 = 0x42b2d4fc
|
||||
//
|
||||
// There is no underflow.
|
||||
|
||||
// Registers used
|
||||
//*********************************************************************
|
||||
// Floating Point registers used:
|
||||
// f8 input, output
|
||||
// f6,f7, f9 -> f15, f32 -> f45
|
||||
|
||||
// General registers used:
|
||||
// r2, r3, r16 -> r38
|
||||
|
||||
// Predicate registers used:
|
||||
// p6 -> p15
|
||||
|
||||
// Assembly macros
|
||||
//*********************************************************************
|
||||
// Symbolic register names used by coshf and by its __libm_error_region stub.
// Several names below are bound to the same physical register (for example
// rSignexp_x/rExp_x on r18, or fA4/fA43/fA4321 on f38): writing through one
// alias overwrites the value seen through the others.
// integer registers used
|
||||
// scratch
|
||||
rNJ = r2
|
||||
rNJ_neg = r3
|
||||
|
||||
rJ_neg = r16
|
||||
rN_neg = r17
|
||||
rSignexp_x = r18
|
||||
rExp_x = r18
|
||||
rExp_mask = r19
|
||||
rExp_bias = r20
|
||||
rAd1 = r21
|
||||
rAd2 = r22
|
||||
rJ = r23
|
||||
rN = r24
|
||||
rTblAddr = r25
|
||||
rA3 = r26
|
||||
rExpHalf = r27
|
||||
rLn2Div64 = r28
|
||||
rGt_ln = r29
|
||||
r17ones_m1 = r29
|
||||
rRightShifter = r30
|
||||
rJ_mask = r30
|
||||
r64DivLn2 = r31
|
||||
rN_mask = r31
|
||||
// stacked
// r32-r34 are the three locals and r35-r38 the four outputs requested by
// "alloc r32 = ar.pfs, 0, 3, 4, 0" on the overflow path.
|
||||
GR_SAVE_PFS = r32
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_GP = r34
|
||||
GR_Parameter_X = r35
|
||||
GR_Parameter_Y = r36
|
||||
GR_Parameter_RESULT = r37
|
||||
GR_Parameter_TAG = r38
|
||||
|
||||
// floating point registers used
|
||||
// FR_X shares f10 with fNint (below); the overflow path copies x into it
// with fmerge.s just before branching to the error stub.
FR_X = f10
|
||||
// FR_Y is f1; the error stub stores it as parameter 2.
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
// scratch
|
||||
fRightShifter = f6
|
||||
f64DivLn2 = f7
|
||||
fNormX = f9
|
||||
fNint = f10
|
||||
fN = f11
|
||||
fR = f12
|
||||
fLn2Div64 = f13
|
||||
fA2 = f14
|
||||
fA3 = f15
|
||||
// stacked
|
||||
fP = f32
|
||||
fT = f33
|
||||
fMIN_SGL_OFLOW_ARG = f34
|
||||
fMAX_SGL_NORM_ARG = f35
|
||||
fRSqr = f36
|
||||
fA1 = f37
|
||||
fA21 = f37
|
||||
fA4 = f38
|
||||
fA43 = f38
|
||||
fA4321 = f38
|
||||
fX4 = f39
|
||||
fTmp = f39
|
||||
fGt_pln = f39
|
||||
fWre_urm_f8 = f40
|
||||
fXsq = f40
|
||||
fP_neg = f41
|
||||
fT_neg = f42
|
||||
fExp = f43
|
||||
fExp_neg = f44
|
||||
fAbsX = f45
|
||||
|
||||
|
||||
RODATA
|
||||
.align 16
|
||||
|
||||
// Overflow thresholds followed by the 2^(j/64) lookup table for coshf.
// Layout, as the code consumes it:
//   +0  two single-precision thresholds, loaded as a pair by
//       "ldfps ... = [rTblAddr], 8"
//   +8  eight bytes of padding, stepped over by "add rTblAddr = 8, rTblAddr"
//   +16 64 eight-byte entries, addressed by "shladd rJ = rJ, 3, rTblAddr"
// Each entry carries only the double-precision fraction bits of 2^(j/64)
// (exponent field zero), so the exponent bits built from n can be OR-ed in.
LOCAL_OBJECT_START(_coshf_table)
|
||||
data4 0x42b2d4fd // Smallest single arg to overflow single result
|
||||
data4 0x42b2d4fc // Largest single arg to give normal single result
|
||||
data4 0x00000000 // pad
|
||||
data4 0x00000000 // pad
|
||||
//
|
||||
// 2^(j/64) table, j goes from 0 to 63
|
||||
data8 0x0000000000000000 // 2^(0/64)
|
||||
data8 0x00002C9A3E778061 // 2^(1/64)
|
||||
data8 0x000059B0D3158574 // 2^(2/64)
|
||||
data8 0x0000874518759BC8 // 2^(3/64)
|
||||
data8 0x0000B5586CF9890F // 2^(4/64)
|
||||
data8 0x0000E3EC32D3D1A2 // 2^(5/64)
|
||||
data8 0x00011301D0125B51 // 2^(6/64)
|
||||
data8 0x0001429AAEA92DE0 // 2^(7/64)
|
||||
data8 0x000172B83C7D517B // 2^(8/64)
|
||||
data8 0x0001A35BEB6FCB75 // 2^(9/64)
|
||||
data8 0x0001D4873168B9AA // 2^(10/64)
|
||||
data8 0x0002063B88628CD6 // 2^(11/64)
|
||||
data8 0x0002387A6E756238 // 2^(12/64)
|
||||
data8 0x00026B4565E27CDD // 2^(13/64)
|
||||
data8 0x00029E9DF51FDEE1 // 2^(14/64)
|
||||
data8 0x0002D285A6E4030B // 2^(15/64)
|
||||
data8 0x000306FE0A31B715 // 2^(16/64)
|
||||
data8 0x00033C08B26416FF // 2^(17/64)
|
||||
data8 0x000371A7373AA9CB // 2^(18/64)
|
||||
data8 0x0003A7DB34E59FF7 // 2^(19/64)
|
||||
data8 0x0003DEA64C123422 // 2^(20/64)
|
||||
data8 0x0004160A21F72E2A // 2^(21/64)
|
||||
data8 0x00044E086061892D // 2^(22/64)
|
||||
data8 0x000486A2B5C13CD0 // 2^(23/64)
|
||||
data8 0x0004BFDAD5362A27 // 2^(24/64)
|
||||
data8 0x0004F9B2769D2CA7 // 2^(25/64)
|
||||
data8 0x0005342B569D4F82 // 2^(26/64)
|
||||
data8 0x00056F4736B527DA // 2^(27/64)
|
||||
data8 0x0005AB07DD485429 // 2^(28/64)
|
||||
data8 0x0005E76F15AD2148 // 2^(29/64)
|
||||
data8 0x0006247EB03A5585 // 2^(30/64)
|
||||
data8 0x0006623882552225 // 2^(31/64)
|
||||
data8 0x0006A09E667F3BCD // 2^(32/64)
|
||||
data8 0x0006DFB23C651A2F // 2^(33/64)
|
||||
data8 0x00071F75E8EC5F74 // 2^(34/64)
|
||||
data8 0x00075FEB564267C9 // 2^(35/64)
|
||||
data8 0x0007A11473EB0187 // 2^(36/64)
|
||||
data8 0x0007E2F336CF4E62 // 2^(37/64)
|
||||
data8 0x00082589994CCE13 // 2^(38/64)
|
||||
data8 0x000868D99B4492ED // 2^(39/64)
|
||||
data8 0x0008ACE5422AA0DB // 2^(40/64)
|
||||
data8 0x0008F1AE99157736 // 2^(41/64)
|
||||
data8 0x00093737B0CDC5E5 // 2^(42/64)
|
||||
data8 0x00097D829FDE4E50 // 2^(43/64)
|
||||
data8 0x0009C49182A3F090 // 2^(44/64)
|
||||
data8 0x000A0C667B5DE565 // 2^(45/64)
|
||||
data8 0x000A5503B23E255D // 2^(46/64)
|
||||
data8 0x000A9E6B5579FDBF // 2^(47/64)
|
||||
data8 0x000AE89F995AD3AD // 2^(48/64)
|
||||
data8 0x000B33A2B84F15FB // 2^(49/64)
|
||||
data8 0x000B7F76F2FB5E47 // 2^(50/64)
|
||||
data8 0x000BCC1E904BC1D2 // 2^(51/64)
|
||||
data8 0x000C199BDD85529C // 2^(52/64)
|
||||
data8 0x000C67F12E57D14B // 2^(53/64)
|
||||
data8 0x000CB720DCEF9069 // 2^(54/64)
|
||||
data8 0x000D072D4A07897C // 2^(55/64)
|
||||
data8 0x000D5818DCFBA487 // 2^(56/64)
|
||||
data8 0x000DA9E603DB3285 // 2^(57/64)
|
||||
data8 0x000DFC97337B9B5F // 2^(58/64)
|
||||
data8 0x000E502EE78B3FF6 // 2^(59/64)
|
||||
data8 0x000EA4AFA2A490DA // 2^(60/64)
|
||||
data8 0x000EFA1BEE615A27 // 2^(61/64)
|
||||
data8 0x000F50765B6E4540 // 2^(62/64)
|
||||
data8 0x000FA7C1819E90D8 // 2^(63/64)
|
||||
LOCAL_OBJECT_END(_coshf_table)
|
||||
|
||||
// Double-precision coefficients of the small-argument polynomial
//   cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8)
// evaluated at COSH_SMALL.
// COSH_SMALL never references this symbol by name: it loads A4/A3 from
// rTblAddr + 0x200 and A2/A1 from rTblAddr + 0x210, where rTblAddr points at
// the 2^(0/64) entry of _coshf_table (64 entries * 8 bytes = 0x200).
// This object therefore has to stay directly after _coshf_table, in this
// order, with nothing in between.
LOCAL_OBJECT_START(cosh_p_table)
|
||||
data8 0x3efa3001dcf5905b // A4
|
||||
data8 0x3f56c1437543543e // A3
|
||||
data8 0x3fa5555572601504 // A2
|
||||
data8 0x3fdfffffffe2f097 // A1
|
||||
LOCAL_OBJECT_END(cosh_p_table)
|
||||
|
||||
|
||||
.section .text
|
||||
// float coshf(float x)
//   In:   f8 = x
//   Out:  f8 = cosh(x), produced by a single-precision (.s) fma
//   Exit: br.ret b0 on every path except certain overflow, which leaves
//         through __libm_error_region.
// The bundles up to COSH_COMMON only materialize constants, normalize x and
// classify it (unorm -> p6, NaT/NaN/Inf -> p15, zero -> p13); the special
// cases themselves are resolved at COSH_COMMON.
GLOBAL_IEEE754_ENTRY(coshf)
|
||||
|
||||
{ .mlx
|
||||
getf.exp rSignexp_x = f8 // Must recompute if x unorm
|
||||
movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
|
||||
}
|
||||
{ .mlx
|
||||
// address of the linkage-table slot holding &_coshf_table
addl rTblAddr = @ltoff(_coshf_table),gp
|
||||
movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// point to the beginning of the table
|
||||
ld8 rTblAddr = [rTblAddr]
|
||||
fclass.m p6, p0 = f8, 0x0b // Test for x=unorm
|
||||
addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnorm.s1 fNormX = f8 // normalized x
|
||||
addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
|
||||
fclass.m p15, p0 = f8, 0x1e3 // test for NaT,NaN,Inf
|
||||
nop.i 0
|
||||
}
|
||||
{ .mlx
|
||||
// load Right Shifter to FP reg
|
||||
setf.d fRightShifter = rRightShifter
|
||||
movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
mov rExp_mask = 0x1ffff
|
||||
fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
|
||||
shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
// COSH_UNORM re-reads the exponent from fNormX and jumps back to COSH_COMMON
(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm
|
||||
}
|
||||
;;
|
||||
|
||||
// Common path. Reached by fall-through from the entry bundles and by the
// branch back from COSH_UNORM.
// Order of work:
//   1. return early for NaT/NaN/Inf (p15) and for x == 0 (p13)
//   2. branch to COSH_SMALL when the true exponent of x is below -2
//   3. branch to COSH_CERTAIN_OVERFLOW when |x| >= fMIN_SGL_OFLOW_ARG
//   4. otherwise build exp(x)/2 and exp(-x)/2 side by side and add them
// p13 is reused: it first means "x == 0", is then overwritten as the
// complement output of the certain-overflow compare, and finally means
// "|x| > fMAX_SGL_NORM_ARG" (possible overflow).
COSH_COMMON:
|
||||
{ .mfi
|
||||
setf.exp fA2 = rExpHalf // load A2 to FP reg
|
||||
nop.f 0
|
||||
mov rExp_bias = 0xffff
|
||||
}
|
||||
{ .mfb
|
||||
setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
|
||||
(p15) fma.s.s0 f8 = f8, f8, f0 // result if x = NaT,NaN,Inf
|
||||
(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,Inf
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// min overflow and max normal threshold
|
||||
// (post-increment leaves rTblAddr at table + 8, i.e. at the padding)
ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8
|
||||
nop.f 0
|
||||
and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
|
||||
}
|
||||
{ .mfb
|
||||
setf.s fA3 = rA3 // load A3 to FP reg
|
||||
(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0
|
||||
(p13) br.ret.spnt b0 // exit here if x =0.0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
sub rExp_x = rExp_x, rExp_bias // True exponent of x
|
||||
fmerge.s fAbsX = f0, fNormX // Form |x|
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// x*(64/ln(2)) + Right Shifter
|
||||
fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
|
||||
// skip the padding: rTblAddr now points at the 2^(0/64) entry
add rTblAddr = 8, rTblAddr
|
||||
}
|
||||
{ .mfb
|
||||
cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
|
||||
fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
|
||||
(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// check for overflow
|
||||
fcmp.ge.s1 p12, p13 = fAbsX, fMIN_SGL_OFLOW_ARG
|
||||
mov rJ_mask = 0x3f // 6-bit mask for J
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fms.s1 fN = fNint, f1, fRightShifter // n in FP register
|
||||
// branch out if overflow
|
||||
(p12) br.cond.spnt COSH_CERTAIN_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
getf.sig rNJ = fNint // bits of n, j
|
||||
// check for possible overflow
|
||||
fcmp.gt.s1 p13, p0 = fAbsX, fMAX_SGL_NORM_ARG
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// From here the +x and -x exponentials are built in parallel; registers
// with a _neg suffix belong to exp(-x).
// 0xFFBF - 63 = 0xFF80 = 64 * 0x3FE: it adds (double-precision bias - 1) to
// n while n still sits above the six j bits, which is why the value formed
// below is 2^(n-1) rather than 2^n.
{ .mfi
|
||||
addl rN = 0xFFBF - 63, rNJ // biased and shifted n-1,j
|
||||
fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
|
||||
and rJ = rJ_mask, rNJ // bits of j
|
||||
}
|
||||
{ .mfi
|
||||
sub rNJ_neg = r0, rNJ // bits of n, j for -x
|
||||
nop.f 0
|
||||
andcm rN_mask = -1, rJ_mask // 0xff...fc0 to mask N
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
|
||||
nop.f 0
|
||||
and rN = rN_mask, rN // biased, shifted n-1
|
||||
}
|
||||
{ .mfi
|
||||
addl rN_neg = 0xFFBF - 63, rNJ_neg // -x biased, shifted n-1,j
|
||||
nop.f 0
|
||||
and rJ_neg = rJ_mask, rNJ_neg // bits of j for -x
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ld8 rJ = [rJ] // Table value
|
||||
nop.f 0
|
||||
// bits 6.. of rN move to bits 52.., the DP exponent field
shl rN = rN, 46 // 2^(n-1) bits in DP format
|
||||
}
|
||||
{ .mfi
|
||||
shladd rJ_neg = rJ_neg, 3, rTblAddr // addr in 2^(j/64) table -x
|
||||
nop.f 0
|
||||
and rN_neg = rN_mask, rN_neg // biased, shifted n-1 for -x
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ld8 rJ_neg = [rJ_neg] // Table value for -x
|
||||
nop.f 0
|
||||
shl rN_neg = rN_neg, 46 // 2^(n-1) bits in DP format for -x
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
or rN = rN, rJ // bits of 2^(n-1) * 2^(j/64) in DP format
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmf
|
||||
setf.d fT = rN // 2^(n-1) * 2^(j/64)
|
||||
or rN_neg = rN_neg, rJ_neg // -x bits of 2^(n-1) * 2^(j/64) in DP
|
||||
fma.s1 fRSqr = fR, fR, f0 // R^2
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.d fT_neg = rN_neg // 2^(n-1) * 2^(j/64) for -x
|
||||
fma.s1 fP = fA3, fR, fA2 // A3*R + A2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// fnma gives A2 - A3*R, i.e. the same polynomial evaluated at -R
fnma.s1 fP_neg = fA3, fR, fA2 // A3*R + A2 for -x
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// fms subtracts R, again matching the substitution R -> -R
fms.s1 fP_neg = fP_neg, fRSqr, fR // P = (A3*R + A2)*R^2 + R, -x
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fmpy.s0 fTmp = fLn2Div64, fLn2Div64 // Force inexact
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fExp = fP, fT, fT // exp(x)/2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fma.s1 fExp_neg = fP_neg, fT_neg, fT_neg // exp(-x)/2
|
||||
// branch out if possible overflow result
|
||||
(p13) br.cond.spnt COSH_POSSIBLE_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result in the absence of overflow
|
||||
fma.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)+exp(-x))/2
|
||||
// exit here in the absence of overflow
|
||||
br.ret.sptk b0 // Exit main path, 0.25 <= |x| < 89.41598
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if 0 < |x| < 0.25. Evaluate the 8th order polynomial
//   cosh(x) = 1 + x^2 * (A1 + A2*x^2 + A3*x^4 + A4*x^6)
// On entry fXsq already holds x*x and rTblAddr points at the 2^(0/64) entry
// of _coshf_table; the coefficients are read from the 32 bytes that follow
// the 64 table entries (offsets 0x200 and 0x210).
COSH_SMALL:
|
||||
{ .mmi
|
||||
add rAd1 = 0x200, rTblAddr
|
||||
add rAd2 = 0x210, rTblAddr
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// These loads replace the fA2/fA3 values set up for the exp path.
{ .mmi
|
||||
ldfpd fA4, fA3 = [rAd1]
|
||||
ldfpd fA2, fA1 = [rAd2]
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// x^4
fma.s1 fX4 = fXsq, fXsq, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// A4*x^2 + A3
fma.s1 fA43 = fXsq, fA4, fA3
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// A2*x^2 + A1
fma.s1 fA21 = fXsq, fA2, fA1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// x^4*(A4*x^2 + A3) + (A2*x^2 + A1)
fma.s1 fA4321 = fX4, fA43, fA21
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Dummy multiply to generate inexact
|
||||
// fA4 and fA4321 name the same register (f38), so by now this squares the
// polynomial value rather than the A4 coefficient; the product goes to fTmp.
{ .mfi
|
||||
nop.m 0
|
||||
fmpy.s0 fTmp = fA4, fA4
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// result = 1 + x^2 * polynomial, rounded to single
fma.s.s0 f8 = fA4321, fXsq, f1
|
||||
br.ret.sptk b0 // Exit if 0 < |x| < 0.25
|
||||
}
|
||||
;;
|
||||
|
||||
COSH_POSSIBLE_OVERFLOW:
|
||||
|
||||
// Here if fMAX_SGL_NORM_ARG < |x| < fMIN_SGL_OFLOW_ARG
|
||||
// This cannot happen if the input is a single, only if the input has higher
// precision.
|
||||
// Overflow is a possibility, not a certainty.
|
||||
|
||||
// Recompute result using status field 2 with user's rounding mode,
|
||||
// and wre set. If result is larger than largest single, then we have
|
||||
// overflow
|
||||
// NOTE(review): both fma's below use only fP and fT, i.e. the exp(x)/2 term;
// the exp(-x)/2 term (fP_neg/fT_neg) that the main path adds is left out.
// fT is derived from the signed fNormX while the range test uses fAbsX, so
// for a negative x in this gap fP/fT would be the small term. Confirm this
// is intended before reusing the path for inputs wider than single.
|
||||
|
||||
{ .mfi
|
||||
mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
|
||||
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
|
||||
fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// same product as the trial above, now rounded under status field 0
fma.s.s0 f8 = fP, fT, fT
|
||||
br.ret.sptk b0 // Exit if really no overflow
|
||||
}
|
||||
;;
|
||||
|
||||
// here if overflow
// Reached from COSH_COMMON when |x| >= fMIN_SGL_OFLOW_ARG and from
// COSH_POSSIBLE_OVERFLOW when the trial result is too large.
// Produces the overflowed single result in f8, saves the original x in
// FR_X and hands over to __libm_error_region with tag 65.
|
||||
COSH_CERTAIN_OVERFLOW:
|
||||
{ .mmi
|
||||
addl r17ones_m1 = 0x1FFFE, r0
|
||||
;;
|
||||
// fTmp gets exponent field 0x1FFFE; squaring it below overflows single
setf.exp fTmp = r17ones_m1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// 0 inputs, 3 locals (r32-r34: GR_SAVE_*), 4 outputs (r35-r38:
// GR_Parameter_X/Y/RESULT/TAG)
alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
|
||||
// FR_X is f10, the register fNint used earlier on the main path
fmerge.s FR_X = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
// NOTE(review): 65 is presumably the coshf-overflow code expected by
// __libm_error_support - confirm against the libm error tag list.
mov GR_Parameter_TAG = 65
|
||||
fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if x unorm
// The exponent read from f8 at entry is not usable for an unnormal input,
// so it is re-read from the normalized copy fNormX. The fcmp under status
// field 0 is executed only for its flag side effect; its p6 result is not
// consulted again on the common path.
|
||||
COSH_UNORM:
|
||||
{ .mfb
|
||||
getf.exp rSignexp_x = fNormX // Must recompute if x unorm
|
||||
fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
|
||||
br.cond.sptk COSH_COMMON // Return to main path
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_IEEE754_END(coshf)
|
||||
libm_alias_float_other (__cosh, cosh)
|
||||
|
||||
|
||||
// Error stub shared by the overflow paths of coshf.
// On entry: FR_X = original argument, FR_RESULT (f8) = provisional result,
//           GR_Parameter_TAG = error tag, and the stacked registers have
//           already been allocated by the caller's alloc.
// It builds a 64-byte frame, spills the three floats to it, and calls
// __libm_error_support with the output registers holding
//   GR_Parameter_X      = sp + 16  (address of x)
//   GR_Parameter_Y      = sp + 32  (address of the FR_Y slot)
//   GR_Parameter_RESULT = sp + 48  (address of the result)
//   GR_Parameter_TAG    = error tag
// After the call the result slot is reloaded into f8 and the stub returns
// to coshf's caller with sp, b0, gp and ar.pfs restored.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32)
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// post-increment moves GR_Parameter_Y on to the result slot (sp + 48)
stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mfi
|
||||
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
nop.f 0
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
}
|
||||
{ .mib
|
||||
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
// step back to the parameter 2 slot (sp + 32) before the call
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
|
||||
// Recompute the result address from sp rather than relying on the value
// held in the output register before the call.
{ .mmi
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
File diff suppressed because it is too large
Load Diff
@ -1,799 +0,0 @@
|
||||
.file "exp.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 2/02/00 Initial version
|
||||
// 3/07/00 exp(inf) = inf but now does NOT call error support
|
||||
// exp(-inf) = 0 but now does NOT call error support
|
||||
// 4/04/00 Unwind support added
|
||||
// 8/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 11/30/00 Reworked to shorten main path, widen main path to include all
|
||||
// args in normal range, and add quick exit for 0, nan, inf.
|
||||
// 12/05/00 Loaded constants earlier with setf to save 2 cycles.
|
||||
// 02/05/02 Corrected uninitialize predicate in POSSIBLE_UNDERFLOW path
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 09/07/02 Force inexact flag
|
||||
// 11/15/02 Split underflow path into zero/nonzero; eliminated fma in main path
|
||||
// 05/30/03 Set inexact flag on unmasked overflow/underflow
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
|
||||
// API
|
||||
//==============================================================
|
||||
// double exp(double)
|
||||
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Take the input x. w is "how many log2/128 in x?"
|
||||
// w = x * 128/log2
|
||||
// n = int(w)
|
||||
// x = n log2/128 + r + delta
|
||||
|
||||
// n = 128M + index_1 + 2^4 index_2
|
||||
// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
|
||||
|
||||
// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
|
||||
// Construct 2^M
|
||||
// Get 2^(index_1/128) from table_1;
|
||||
// Get 2^(index_2/8) from table_2;
|
||||
// Calculate exp(r) by 5th order polynomial
|
||||
// r = x - n (log2/128)_high
|
||||
// delta = - n (log2/128)_low
|
||||
// Calculate exp(delta) as 1 + delta
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// exp(+0) = 1.0
|
||||
// exp(-0) = 1.0
|
||||
|
||||
// exp(+qnan) = +qnan
|
||||
// exp(-qnan) = -qnan
|
||||
// exp(+snan) = +qnan
|
||||
// exp(-snan) = -qnan
|
||||
|
||||
// exp(-inf) = +0
|
||||
// exp(+inf) = +inf
|
||||
|
||||
// Overflow and Underflow
|
||||
//=======================
|
||||
// exp(x) = largest double normal when
|
||||
// x = 709.7827 = 0x40862e42fefa39ef
|
||||
|
||||
// exp(x) = smallest double normal when
|
||||
// x = -708.396 = 0xc086232bdd7abcd2
|
||||
|
||||
// exp(x) = largest round-to-nearest single zero when
|
||||
// x = -745.1332 = 0xc0874910d52d3052
|
||||
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// Floating Point registers used:
|
||||
// f8, input, output
|
||||
// f6 -> f15, f32 -> f49
|
||||
|
||||
// General registers used:
|
||||
// r14 -> r40
|
||||
|
||||
// Predicate registers used:
|
||||
// p6 -> p15
|
||||
|
||||
// Assembly macros
|
||||
//==============================================================
|
||||
|
||||
rRshf = r14
|
||||
rAD_TB1 = r15
|
||||
rAD_T1 = r15
|
||||
rAD_TB2 = r16
|
||||
rAD_T2 = r16
|
||||
rAD_P = r17
|
||||
rN = r18
|
||||
rIndex_1 = r19
|
||||
rIndex_2_16 = r20
|
||||
rM = r21
|
||||
rBiased_M = r21
|
||||
rIndex_1_16 = r21
|
||||
rSig_inv_ln2 = r22
|
||||
rExp_bias = r23
|
||||
rExp_mask = r24
|
||||
rTmp = r25
|
||||
rRshf_2to56 = r26
|
||||
rGt_ln = r27
|
||||
rExp_2tom56 = r28
|
||||
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
fRSHF_2TO56 = f6
|
||||
fINV_LN2_2TO63 = f7
|
||||
fW_2TO56_RSH = f9
|
||||
f2TOM56 = f11
|
||||
fP5 = f12
|
||||
fP54 = f12
|
||||
fP5432 = f12
|
||||
fP4 = f13
|
||||
fP3 = f14
|
||||
fP32 = f14
|
||||
fP2 = f15
|
||||
fP = f15
|
||||
|
||||
fLn2_by_128_hi = f33
|
||||
fLn2_by_128_lo = f34
|
||||
|
||||
fRSHF = f35
|
||||
fNfloat = f36
|
||||
fNormX = f37
|
||||
fR = f38
|
||||
fF = f39
|
||||
|
||||
fRsq = f40
|
||||
f2M = f41
|
||||
fS1 = f42
|
||||
fT1 = f42
|
||||
fS2 = f43
|
||||
fT2 = f43
|
||||
fS = f43
|
||||
fWre_urm_f8 = f44
|
||||
fFtz_urm_f8 = f44
|
||||
|
||||
fMIN_DBL_OFLOW_ARG = f45
|
||||
fMAX_DBL_ZERO_ARG = f46
|
||||
fMAX_DBL_NORM_ARG = f47
|
||||
fMIN_DBL_NORM_ARG = f48
|
||||
fGt_pln = f49
|
||||
fTmp = f49
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
.align 16
|
||||
|
||||
// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
|
||||
|
||||
// double-extended 1/ln(2)
|
||||
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
|
||||
// 3fff b8aa 3b29 5c17 f0bc
|
||||
// For speed the significand will be loaded directly with a movl and setf.sig
|
||||
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
|
||||
// computations need to scale appropriately.
|
||||
// The constant 128/ln(2) is needed for the computation of w. This is also
|
||||
// obtained by scaling the computations.
|
||||
//
|
||||
// Two shifting constants are loaded directly with movl and setf.d.
|
||||
// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7)
|
||||
// This constant is added to x*1/ln2 to shift the integer part of
|
||||
// x*128/ln2 into the rightmost bits of the significand.
|
||||
// The result of this fma is fW_2TO56_RSH.
|
||||
// 2. fRSHF = 1.1000..00 * 2^(63)
|
||||
// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
|
||||
// the integer part of w, n, as a floating-point number.
|
||||
// The result of this fms is fNfloat.
|
||||
|
||||
|
||||
// Range thresholds, the split constant ln2/128, and Table 1 for exp.
// Layout, as the code walks it with post-incremented loads on rAD_TB1:
//   +0x00  two doubles  (ldfpd ..., 16): min overflow arg, max zero arg
//   +0x10  two doubles  (ldfpd ..., 16): max normal arg, min normal arg
//   +0x20  16-byte extended value (ldfe ..., 16): ln2/128 high part
//   +0x30  16-byte extended value (ldfe ..., 16): ln2/128 low part
//   +0x40  Table 1: 16 entries of 16 bytes (significand, then exponent word)
// After the last ldfe, rAD_TB1 points at Table 1. exp_table_2 is then
// addressed as rAD_TB1 + 0x100, so it has to follow this object directly.
LOCAL_OBJECT_START(exp_table_1)
|
||||
data8 0x40862e42fefa39f0 // smallest dbl overflow arg, +709.7827
|
||||
data8 0xc0874910d52d3052 // largest arg for rnd-to-nearest 0 result, -745.133
|
||||
data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result, +709.7827
|
||||
data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result, -708.396
|
||||
data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
|
||||
data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
|
||||
//
|
||||
// Table 1 is 2^(index_1/128) where
|
||||
// index_1 goes from 0 to 15
|
||||
//
|
||||
data8 0x8000000000000000 , 0x00003FFF
|
||||
data8 0x80B1ED4FD999AB6C , 0x00003FFF
|
||||
data8 0x8164D1F3BC030773 , 0x00003FFF
|
||||
data8 0x8218AF4373FC25EC , 0x00003FFF
|
||||
data8 0x82CD8698AC2BA1D7 , 0x00003FFF
|
||||
data8 0x8383594EEFB6EE37 , 0x00003FFF
|
||||
data8 0x843A28C3ACDE4046 , 0x00003FFF
|
||||
data8 0x84F1F656379C1A29 , 0x00003FFF
|
||||
data8 0x85AAC367CC487B15 , 0x00003FFF
|
||||
data8 0x8664915B923FBA04 , 0x00003FFF
|
||||
data8 0x871F61969E8D1010 , 0x00003FFF
|
||||
data8 0x87DB357FF698D792 , 0x00003FFF
|
||||
data8 0x88980E8092DA8527 , 0x00003FFF
|
||||
data8 0x8955EE03618E5FDD , 0x00003FFF
|
||||
data8 0x8A14D575496EFD9A , 0x00003FFF
|
||||
data8 0x8AD4C6452C728924 , 0x00003FFF
|
||||
LOCAL_OBJECT_END(exp_table_1)
|
||||
|
||||
// Table 2 is 2^(index_2/8) where
|
||||
// index_2 goes from 0 to 7
|
||||
// Eight 16-byte entries (significand, then exponent word), indexed by
// rIndex_2_16 = index_2 * 16. The code reaches this table as
// rAD_TB1 + 0x100 (just past the 16 entries of Table 1) and reaches
// exp_p_table as this table + 0x80, so the three objects must stay adjacent
// and in this order.
LOCAL_OBJECT_START(exp_table_2)
|
||||
data8 0x8000000000000000 , 0x00003FFF
|
||||
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
|
||||
data8 0x9837F0518DB8A96F , 0x00003FFF
|
||||
data8 0xA5FED6A9B15138EA , 0x00003FFF
|
||||
data8 0xB504F333F9DE6484 , 0x00003FFF
|
||||
data8 0xC5672A115506DADD , 0x00003FFF
|
||||
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
|
||||
data8 0xEAC0C6E7DD24392F , 0x00003FFF
|
||||
LOCAL_OBJECT_END(exp_table_2)
|
||||
|
||||
|
||||
// Double-precision coefficients of the 5th order polynomial used for exp(r):
//   P = r + r^2 * (P2 + P3*r + P4*r^2 + P5*r^3)
// Loaded as two pairs through rAD_P (P5/P4, then P3/P2). rAD_P is computed
// as exp_table_2 + 0x80 rather than from this symbol, so this object has to
// stay directly after exp_table_2.
LOCAL_OBJECT_START(exp_p_table)
|
||||
data8 0x3f8111116da21757 //P5
|
||||
data8 0x3fa55555d787761c //P4
|
||||
data8 0x3fc5555555555414 //P3
|
||||
data8 0x3fdffffffffffd6a //P2
|
||||
LOCAL_OBJECT_END(exp_p_table)
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_IEEE754_ENTRY(exp)
|
||||
|
||||
{ .mlx
|
||||
nop.m 0
|
||||
movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
|
||||
}
|
||||
{ .mlx
|
||||
addl rAD_TB1 = @ltoff(exp_table_1), gp
|
||||
movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ld8 rAD_TB1 = [rAD_TB1]
|
||||
fclass.m p8,p0 = f8,0x07 // Test for x=0
|
||||
mov rExp_mask = 0x1ffff
|
||||
}
|
||||
{ .mfi
|
||||
mov rExp_bias = 0xffff
|
||||
fnorm.s1 fNormX = f8
|
||||
mov rExp_2tom56 = 0xffff-56
|
||||
}
|
||||
;;
|
||||
|
||||
// Form two constants we need
|
||||
// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
|
||||
// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
|
||||
|
||||
{ .mfi
|
||||
setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
|
||||
fclass.m p9,p0 = f8,0x22 // Test for x=-inf
|
||||
nop.i 0
|
||||
}
|
||||
{ .mlx
|
||||
setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
|
||||
movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_ZERO_ARG = [rAD_TB1],16
|
||||
fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, nan, NaT
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
|
||||
(p9) fma.d.s0 f8 = f0,f0,f0 // quick exit for x=-inf
|
||||
(p9) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfpd fMAX_DBL_NORM_ARG, fMIN_DBL_NORM_ARG = [rAD_TB1],16
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
|
||||
(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
ldfe fLn2_by_128_hi = [rAD_TB1],16
|
||||
(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=+inf, nan, NaT
|
||||
(p10) br.ret.spnt b0 // quick exit for x=+inf, nan, NaT
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ldfe fLn2_by_128_lo = [rAD_TB1],16
|
||||
fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// After that last load, rAD_TB1 points to the beginning of table 1
|
||||
|
||||
// W = X * Inv_log2_by_128
|
||||
// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
|
||||
// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Divide arguments into the following categories:
|
||||
// Certain Underflow p11 - -inf < x <= MAX_DBL_ZERO_ARG
|
||||
// Possible Underflow p13 - MAX_DBL_ZERO_ARG < x < MIN_DBL_NORM_ARG
|
||||
// Certain Safe - MIN_DBL_NORM_ARG <= x <= MAX_DBL_NORM_ARG
|
||||
// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG
|
||||
// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf
|
||||
//
|
||||
// If the input is really a double arg, then there will never be
|
||||
// "Possible Overflow" arguments.
|
||||
//
|
||||
|
||||
{ .mfi
|
||||
add rAD_TB2 = 0x100, rAD_TB1
|
||||
fcmp.ge.s1 p15,p0 = fNormX,fMIN_DBL_OFLOW_ARG
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
add rAD_P = 0x80, rAD_TB2
|
||||
fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_ZERO_ARG
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
ldfpd fP5, fP4 = [rAD_P] ,16
|
||||
fcmp.gt.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG
|
||||
(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
// Nfloat = round_int(W)
|
||||
// The signficand of fW_2TO56_RSH contains the rounded integer part of W,
|
||||
// as a twos complement number in the lower bits (that is, it may be negative).
|
||||
// That twos complement number (called N) is put into rN.
|
||||
|
||||
// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
|
||||
// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
|
||||
// Thus, fNfloat contains the floating point version of N
|
||||
|
||||
{ .mfb
|
||||
ldfpd fP3, fP2 = [rAD_P]
|
||||
fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
|
||||
(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
getf.sig rN = fW_2TO56_RSH
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// rIndex_1 has index_1
|
||||
// rIndex_2_16 has index_2 * 16
|
||||
// rBiased_M has M
|
||||
// rIndex_1_16 has index_1 * 16
|
||||
|
||||
// rM has true M
|
||||
// r = x - Nfloat * ln2_by_128_hi
|
||||
// f = 1 - Nfloat * ln2_by_128_lo
|
||||
{ .mfi
|
||||
and rIndex_1 = 0x0f, rN
|
||||
fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
|
||||
shr rM = rN, 0x7
|
||||
}
|
||||
{ .mfi
|
||||
and rIndex_2_16 = 0x70, rN
|
||||
fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// rAD_T1 has address of T1
|
||||
// rAD_T2 has address of T2
|
||||
|
||||
{ .mmi
|
||||
add rBiased_M = rExp_bias, rM
|
||||
add rAD_T2 = rAD_TB2, rIndex_2_16
|
||||
shladd rAD_T1 = rIndex_1, 4, rAD_TB1
|
||||
}
|
||||
;;
|
||||
|
||||
// Create Scale = 2^M
|
||||
{ .mmi
|
||||
setf.exp f2M = rBiased_M
|
||||
ldfe fT2 = [rAD_T2]
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Load T1 and T2
|
||||
{ .mfi
|
||||
ldfe fT1 = [rAD_T1]
|
||||
fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fRsq = fR, fR, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP54 = fR, fP5, fP4
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.lt.s1 p13,p0 = fNormX,fMIN_DBL_NORM_ARG
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP32 = fR, fP3, fP2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP5432 = fRsq, fP54, fP32
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS1 = f2M,fT1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS2 = fF,fT2,f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP = fRsq, fP5432, fR
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fS = fS1,fS2,f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mbb
|
||||
nop.m 0
|
||||
(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW
|
||||
(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8 = fS, fP, fS
|
||||
br.ret.sptk b0 // Normal path exit
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
EXP_POSSIBLE_OVERFLOW:
|
||||
|
||||
// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG
|
||||
// This cannot happen if input is a double, only if input has higher precision.
|
||||
// Overflow is a possibility, not a certainty.
|
||||
|
||||
// Recompute result using status field 2 with user's rounding mode,
|
||||
// and wre set. If result is larger than largest double, then we have
|
||||
// overflow
|
||||
|
||||
{ .mfi
|
||||
mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
|
||||
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
|
||||
fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8 = fS, fP, fS
|
||||
br.ret.sptk b0 // Exit if really no overflow
|
||||
}
|
||||
;;
|
||||
|
||||
EXP_CERTAIN_OVERFLOW:
|
||||
{ .mmi
|
||||
sub rTmp = rExp_mask, r0, 1
|
||||
;;
|
||||
setf.exp fTmp = rTmp
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,1,4,4,0
|
||||
fmerge.s FR_X = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 14
|
||||
fma.d.s0 FR_RESULT = fTmp, fTmp, fTmp // Set I,O and +INF result
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
EXP_POSSIBLE_UNDERFLOW:
|
||||
|
||||
// Here if fMAX_DBL_ZERO_ARG < x < fMIN_DBL_NORM_ARG
|
||||
// Underflow is a possibility, not a certainty
|
||||
|
||||
// We define an underflow when the answer with
|
||||
// ftz set
|
||||
// is zero (tiny numbers become zero)
|
||||
|
||||
// Notice (from below) that if we have an unlimited exponent range,
|
||||
// then there is an extra machine number E between the largest denormal and
|
||||
// the smallest normal.
|
||||
|
||||
// So if with unbounded exponent we round to E or below, then we are
|
||||
// tiny and underflow has occurred.
|
||||
|
||||
// But notice that you can be in a situation where we are tiny, namely
|
||||
// rounded to E, but when the exponent is bounded we round to smallest
|
||||
// normal. So the answer can be the smallest normal with underflow.
|
||||
|
||||
// E
|
||||
// -----+--------------------+--------------------+-----
|
||||
// | | |
|
||||
// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe
|
||||
// 0.1...11 2^-3ffe (biased, 1)
|
||||
// largest dn smallest normal
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.d.s2 fFtz_urm_f8 = fS, fP, fS // Result with ftz set
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x40 // Turn off ftz in sf2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.d.s0 f8 = fS, fP, fS // Compute result, set I, maybe U
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mbb
|
||||
nop.m 0
|
||||
(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow
|
||||
(p7) br.ret.sptk b0 // Exit if really no underflow
|
||||
}
|
||||
;;
|
||||
|
||||
EXP_CERTAIN_UNDERFLOW:
|
||||
// Here if x < fMAX_DBL_ZERO_ARG
|
||||
// Result will be zero (or smallest denorm if round to +inf) with I, U set
|
||||
{ .mmi
|
||||
mov rTmp = 1
|
||||
;;
|
||||
setf.exp fTmp = rTmp // Form small normal
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fmerge.se fTmp = fTmp, fLn2_by_128_lo // Small with signif lsb 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
|
||||
br.cond.sptk EXP_UNDERFLOW_COMMON
|
||||
}
|
||||
;;
|
||||
|
||||
EXP_UNDERFLOW_COMMON:
|
||||
// Determine if underflow result is zero or nonzero
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,1,4,4,0
|
||||
fcmp.eq.s1 p6, p0 = f8, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fmerge.s FR_X = fNormX,fNormX
|
||||
(p6) br.cond.spnt EXP_UNDERFLOW_ZERO
|
||||
}
|
||||
;;
|
||||
|
||||
EXP_UNDERFLOW_NONZERO:
|
||||
// Here if x < fMIN_DBL_NORM_ARG and result nonzero;
|
||||
// I, U are set
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 15
|
||||
nop.f 0 // FR_RESULT already set
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
EXP_UNDERFLOW_ZERO:
|
||||
// Here if x < fMIN_DBL_NORM_ARG and result zero;
|
||||
// I, U are set
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 15
|
||||
nop.f 0 // FR_RESULT already set
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_IEEE754_END(exp)
|
||||
libm_alias_double_other (__exp, exp)
|
||||
#ifdef SHARED
|
||||
.symver exp,exp@@GLIBC_2.29
|
||||
.weak __exp_compat
|
||||
.set __exp_compat,__exp
|
||||
.symver __exp_compat,exp@GLIBC_2.2
|
||||
#endif
|
||||
|
||||
|
||||
// __libm_error_region: common error exit for this file.  It is entered by a
// plain branch (not a call) after the caller has set GR_Parameter_TAG and
// FR_RESULT (and FR_X where needed), so b0 still holds the return address
// of the function's own caller.
//
// The code builds a 64-byte stack frame, stores FR_Y, FR_X and FR_RESULT
// into it, calls __libm_error_support, reloads f8 from the result slot and
// returns through the saved b0.
//
// Frame layout relative to the NEW sp (derived from the adds below):
//   sp+16 : FR_X       (address in GR_Parameter_X)
//   sp+32 : FR_Y       (address in GR_Parameter_Y at the call)
//   sp+48 : FR_RESULT  (address in GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
// Computed from the OLD sp (same instruction group as the sp adjustment
// below), so this is new sp + 32.
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// Post-increment leaves GR_Parameter_Y at new sp + 48 (the result slot).
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
// Step GR_Parameter_Y back to the FR_Y slot (new sp + 32) for the call.
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
// Recompute the result-slot address from sp rather than relying on the
// register value across the call.
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
// External error-support routine called above.
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,609 +0,0 @@
|
||||
.file "exp10.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 08/25/00 Initial version
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 09/06/02 Improved performance; no inexact flags on exact cases
|
||||
// 01/29/03 Added missing } to bundle templates
|
||||
// 12/16/04 Call error handling on underflow.
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// double exp10(double)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x= (K + fh + fl + r)/log2(10), where
|
||||
// K is an integer, fh= 0.b1 b2 b3 b4 b5,
|
||||
// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
|
||||
// and |r|<2^{-11}
|
||||
// Th is a table that stores 2^fh (32 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
// Tl is a table that stores 2^fl (32 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
//
|
||||
// 10^x is approximated as
|
||||
// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*e+c1*r+c2*r^2+c3*r^3+c4*r^4),
|
||||
// where e= (x*log2(10)_hi-RN(x*log2(10)_hi))+log2(10)_lo*x
|
||||
|
||||
// Note there are only 22 non-zero values that produce an exact result:
|
||||
// 1.0, 2.0, ... 22.0.
|
||||
// We test for these cases and use s1 to avoid setting the inexact flag.
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// exp10(0)= 1
|
||||
// exp10(+inf)= inf
|
||||
// exp10(-inf)= 0
|
||||
//
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// r2-r3, r14-r40
|
||||
// f6-f15, f32-f52
|
||||
// p6-p12
|
||||
//
|
||||
|
||||
#include <shlib-compat.h>
|
||||
|
||||
|
||||
// Symbolic register names used by exp10 and its error region.
//
// General registers.  r33-r36 are used as save slots and r37-r40 as the
// __libm_error_support parameter registers; with the function's
// "alloc r32= ar.pfs, 1, 4, 4, 0" these fall in the local and output
// parts of the register frame respectively.
GR_TBL_START = r2
|
||||
GR_LOG_TBL = r3
|
||||
|
||||
GR_OF_LIMIT = r14
|
||||
GR_UF_LIMIT = r15
|
||||
GR_EXP_CORR = r16
|
||||
GR_F_low = r17
|
||||
GR_F_high = r18
|
||||
GR_K = r19
|
||||
GR_Flow_ADDR = r20
|
||||
|
||||
GR_BIAS = r21
|
||||
GR_Fh = r22
|
||||
GR_Fh_ADDR = r23
|
||||
GR_EXPMAX = r24
|
||||
GR_BIAS53 = r25
|
||||
|
||||
// GR_ROUNDVAL and GR_SNORM_LIMIT deliberately share r26: the rounding
// constant is moved to FR_ROUNDVAL before r26 is reloaded with the
// smallest-normal threshold.
GR_ROUNDVAL = r26
|
||||
GR_SNORM_LIMIT = r26
|
||||
GR_MASK = r27
|
||||
GR_KF0 = r28
|
||||
GR_MASK_low = r29
|
||||
GR_COEFF_START = r30
|
||||
GR_exact_limit = r31
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
// Floating-point registers.
// NOTE(review): FR_X and FR_LOG2_10 (below) both name f10.  exp10 writes
// FR_LOG2_10 during the computation and no visible instruction copies the
// original argument into FR_X, so the "parameter 1" value the error region
// stores appears not to be x -- confirm whether that matters to
// __libm_error_support.
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
FR_COEFF1 = f6
|
||||
FR_COEFF2 = f7
|
||||
FR_R = f9
|
||||
FR_LOG2_10 = f10
|
||||
|
||||
FR_2P53 = f11
|
||||
FR_KF0 = f12
|
||||
FR_COEFF3 = f13
|
||||
FR_COEFF4 = f14
|
||||
FR_UF_LIMIT = f15
|
||||
|
||||
FR_OF_LIMIT = f32
|
||||
FR_DX_L210 = f33
|
||||
FR_ROUNDVAL = f34
|
||||
FR_KF = f35
|
||||
|
||||
FR_2_TO_K = f36
|
||||
FR_T_low = f37
|
||||
FR_T_high = f38
|
||||
FR_P34 = f39
|
||||
FR_R2 = f40
|
||||
|
||||
FR_P12 = f41
|
||||
FR_T_low_K = f42
|
||||
FR_P14 = f43
|
||||
FR_T = f44
|
||||
FR_P = f45
|
||||
|
||||
FR_L2_10_low = f46
|
||||
FR_L2_10_high = f47
|
||||
FR_E0 = f48
|
||||
FR_E = f49
|
||||
FR_exact_limit = f50
|
||||
|
||||
FR_int_x = f51
|
||||
FR_SNORM_LIMIT = f52
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
// Constant block read sequentially by exp10 through GR_COEFF_START with
// five 16-byte post-incremented loads (ldfe, ldfe, ldfpd, ldfe, ldfe), in
// exactly the order listed here.  After the fifth load GR_COEFF_START
// points just past this block, which the code then uses as the base of
// T_table -- so T_table must stay immediately after poly_coeffs.
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0xd49a784bcd1b8afe, 0x00003fcb // log2(10)*2^(10-63)
|
||||
data8 0x9257edfe9b5fb698, 0x3fbf // log2(10)_low (bits 64...127)
|
||||
// C_3 and C_4 are two 8-byte values loaded as a pair with ldfpd.
data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4
|
||||
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
|
||||
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
// Two consecutive 32-entry tables of 8-byte significands, each entry
// fetched by exp10 with ldf8:
//   entries  0..31 : indexed by f_low  (base = end of poly_coeffs)
//   entries 32..63 : indexed by f_high (base + 256 bytes)
// The 256-byte offset and the 8-byte stride are hard-coded in exp10
// ("add GR_LOG_TBL= 256, ..." and "shladd ..., 3, ..."), so the entry
// count and order must not change.
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
// Low table: 2^{0.00000 b6 b7 b8 b9 b10}
|
||||
data8 0x8000000000000000, 0x8016302f17467628
|
||||
data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
|
||||
data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
|
||||
data8 0x80855ad965e88b83, 0x809ba2264dada76a
|
||||
data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
|
||||
data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
|
||||
data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
|
||||
data8 0x813801881d886f7b, 0x814e67cceb90502c
|
||||
data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
|
||||
data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
|
||||
data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
|
||||
data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
|
||||
data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
|
||||
data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
|
||||
data8 0x8272fb97b2a5894c, 0x828998760d01faf3
|
||||
data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
|
||||
//
|
||||
// High table: 2^{0.b1 b2 b3 b4 b5}
|
||||
data8 0x8000000000000000, 0x82cd8698ac2ba1d7
|
||||
data8 0x85aac367cc487b14, 0x88980e8092da8527
|
||||
data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
|
||||
data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
|
||||
data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
|
||||
data8 0x9ef5326091a111ad, 0xa27043030c496818
|
||||
data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
|
||||
data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
|
||||
data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
|
||||
data8 0xbd08a39f580c36be, 0xc12c4cca66709456
|
||||
data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
|
||||
data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
|
||||
data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
|
||||
data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
|
||||
data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
|
||||
data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
// double exp10(double x) -- main path.
// f8 holds x on entry; it is normalized in place and finally overwritten
// with the result.
//
// Outline of what the code below does:
//   * p12 <- x is not a non-zero finite number  -> SPECIAL_exp10
//     p6  <- x < 0,  p8 <- x >= 0 (kept live for OUT_RANGE_exp10 and the
//     exact-result test)
//   * FR_KF0 = x*log2(10)*2^(10-63) + rounding constant; the low bits of
//     its significand give K, f_high and f_low
//   * r = x*log2(10)_hi - (K+f), e = rounding error + x*log2(10)_lo
//   * result = T + T*P with T = 2^K * T_low * T_high and
//     P = r*(C_1 + C_2*r + r^2*(C_3 + C_4*r)) + C_1*e
//   * x beyond the overflow/underflow limits -> OUT_RANGE_exp10
//   * x below the smallest-normal threshold  -> __libm_error_region (tag 265)
GLOBAL_IEEE754_ENTRY(exp10)
|
||||
|
||||
|
||||
{.mfi
|
||||
alloc r32= ar.pfs, 1, 4, 4, 0
|
||||
// will continue only for non-zero normal/denormal numbers
|
||||
fclass.nm.unc p12, p7= f8, 0x1b
|
||||
mov GR_BIAS53= 0xffff+63-10
|
||||
}
|
||||
{.mlx
|
||||
// GR_TBL_START= pointer to log2(10), C_1...C_4 followed by T_table
|
||||
addl GR_TBL_START= @ltoff(poly_coeffs), gp
|
||||
movl GR_ROUNDVAL= 0x3fc00000 // 1.5 (SP)
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
|
||||
fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mlx
|
||||
setf.exp FR_2P53= GR_BIAS53 // 2^{63-10}
|
||||
movl GR_UF_LIMIT= 0xc07439b746e36b52 // (-2^10-51) / log2(10)
|
||||
}
|
||||
{.mlx
|
||||
setf.s FR_ROUNDVAL= GR_ROUNDVAL
|
||||
movl GR_OF_LIMIT= 0x40734413509f79fe // Overflow threshold
|
||||
}
|
||||
;;
|
||||
|
||||
{.mlx
|
||||
ldfe FR_LOG2_10= [ GR_COEFF_START ], 16 // load log2(10)*2^(10-63)
|
||||
// r26 is reused here: GR_SNORM_LIMIT aliases GR_ROUNDVAL, which was
// already copied to FR_ROUNDVAL in the previous instruction group.
movl GR_SNORM_LIMIT= 0xc0733a7146f72a41 // Smallest normal threshold
|
||||
}
|
||||
{.mib
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
(p12) br.cond.spnt SPECIAL_exp10 // Branch if nan, inf, zero
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmf
|
||||
ldfe FR_L2_10_low= [ GR_COEFF_START ], 16 // load log2(10)_low
|
||||
setf.d FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
|
||||
fma.s0 f8= f8, f1, f0 // normalize x
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4
|
||||
(p8) fcvt.fx.s1 FR_int_x = f8 // Convert x to integer
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
setf.d FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
|
||||
fma.s1 FR_KF0= f8, FR_LOG2_10, FR_ROUNDVAL // y= (x*log2(10)*2^10 +
|
||||
// 1.5*2^63) * 2^(-63)
|
||||
mov GR_EXP_CORR= 0xffff-126
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
setf.d FR_SNORM_LIMIT= GR_SNORM_LIMIT // Set smallest normal limit
|
||||
fma.s1 FR_L2_10_high= FR_LOG2_10, FR_2P53, f0 // FR_L2_10_high= log2(10)_hi
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
|
||||
fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)*2^(10-63)
|
||||
mov GR_MASK= 1023
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
// Last of the five post-incremented loads: GR_COEFF_START now points just
// past poly_coeffs, i.e. at the start of T_table.
ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
|
||||
// From here on f10 (FR_LOG2_10, also named FR_X) holds x*log2(10)_hi.
fma.s1 FR_LOG2_10= f8, FR_L2_10_high, f0 // y0= x*log2(10)_hi
|
||||
mov GR_MASK_low= 31
|
||||
}
|
||||
;;
|
||||
|
||||
{.mlx
|
||||
getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
|
||||
(p8) movl GR_exact_limit= 0x41b00000 // Largest x for exact result,
|
||||
// +22.0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
|
||||
fcmp.gt.s1 p12, p7= f8, FR_OF_LIMIT // x>overflow threshold ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
(p8) setf.s FR_exact_limit = GR_exact_limit // Largest x for exact result
|
||||
(p8) fcvt.xf FR_int_x = FR_int_x // Integral part of x
|
||||
shr GR_K= GR_KF0, 10 // K
|
||||
}
|
||||
{.mfi
|
||||
and GR_F_high= GR_MASK, GR_KF0 // low 10 bits: f_high*32 + f_low
|
||||
fnma.s1 FR_R= FR_KF, FR_2P53, FR_LOG2_10 // r= x*log2(10)-2^{63-10}*
|
||||
// [ (K+f)*2^{10-63} ]
|
||||
and GR_F_low= GR_KF0, GR_MASK_low // f_low
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
|
||||
// GR_EXP_CORR is 0xffff-126.  The -2*63 presumably offsets the 2^63
// scaling that ldf8 gives each of the two table significands.
add GR_BIAS= GR_K, GR_EXP_CORR // K+bias-2*63
|
||||
shr GR_Fh= GR_F_high, 5 // f_high
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
|
||||
// Only tested when the overflow test above left p7 set; otherwise p12
// stays set from the overflow test.
(p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
|
||||
shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
|
||||
}
|
||||
{.mfi
|
||||
ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
|
||||
fms.s1 FR_DX_L210= f8, FR_L2_10_high, FR_LOG2_10 // x*log2(10)_hi-
|
||||
// RN(x*log2(10)_hi)
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
|
||||
fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.s1 FR_R2= FR_R, FR_R, f0 // r*r
|
||||
(p12) br.cond.spnt OUT_RANGE_exp10
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// e= (x*log2(10)_hi-RN(x*log2(10)_hi))+log2(10)_lo*x
|
||||
fma.s1 FR_E0= f8, FR_L2_10_low, FR_DX_L210
|
||||
cmp.eq p7,p9= r0,r0 // Assume inexact result
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
(p8) fcmp.eq.s1 p9,p7= FR_int_x, f8 // Test x positive integer
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fcmp.ge.s1 p11,p0= f8, FR_SNORM_LIMIT // Test x for normal range
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_E= FR_E0, FR_COEFF1, f0 // E= C_1*e
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// If x a positive integer, will it produce an exact result?
|
||||
// p7 result will be inexact
|
||||
// p9 result will be exact
|
||||
{.mfi
|
||||
nop.m 0
|
||||
(p9) fcmp.le.s1 p9,p7= f8, FR_exact_limit // Test x gives exact result
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P= FR_P14, FR_R, FR_E // P= P14*r+E
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Exactly one of the two final fma forms executes: status field s0 for the
// inexact case, s1 for the exact case so the inexact flag is not raised.
.pred.rel "mutex",p7,p9
|
||||
{.mfi
|
||||
nop.m 0
|
||||
(p7) fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P, inexact set
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p9) fma.d.s1 f8= FR_P, FR_T, FR_T // result= T+T*P, exact use s1
|
||||
(p11) br.ret.sptk b0 // return, if result normal
|
||||
}
|
||||
;;
|
||||
|
||||
// Here if result in denormal range (and not zero)
|
||||
// Reached when p11 is clear (x < FR_SNORM_LIMIT); f8 already holds the
// computed result.
// NOTE(review): FR_X is f10, which at this point holds x*log2(10)_hi, so
// the "parameter 1" stored by __libm_error_region appears not to be x --
// confirm.
{.mib
|
||||
nop.m 0
|
||||
mov GR_Parameter_TAG= 265
|
||||
br.cond.sptk __libm_error_region // Branch to error handling
|
||||
}
|
||||
;;
|
||||
|
||||
// Special-operand path of exp10, entered when the fclass test at function
// entry set p12.  f8 still holds the unmodified argument here.
//   -Infinity -> returns f0 (0)
//   +Infinity -> returns f8 unchanged
//   +/-Zero   -> returns f1 (1)
//   otherwise -> returns f8*1+0 computed with fma.d.s0 (the NaN case)
SPECIAL_exp10:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p6) mov f8= f0 // exp10(-Infinity)= 0
|
||||
(p6) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
(p7) br.ret.spnt b0 // exp10(+Infinity)= +Infinity
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p8) mov f8= f1 // exp10(+/-0)= 1
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// Out-of-range path of exp10, entered when x is above the overflow limit
// or below the underflow limit.  p6/p8 still carry the sign test made at
// function entry (p6: x < 0, p8: x >= 0).
// FR_R is given exponent field 0x1fffe (overflow) or 1 (underflow); the
// double-precision square FR_R*FR_R then produces the result in f8 and
// raises the matching IEEE flags before control goes to
// __libm_error_region with tag 166 (overflow) or 265 (underflow).
// NOTE(review): FR_X (f10) is not loaded with the original x on this path
// either; it still holds whatever FR_LOG2_10 last received -- confirm.
OUT_RANGE_exp10:
|
||||
|
||||
// underflow: p6= 1
|
||||
// overflow: p8= 1
|
||||
|
||||
.pred.rel "mutex",p6,p8
|
||||
{.mmi
|
||||
(p8) mov GR_EXPMAX= 0x1fffe
|
||||
(p6) mov GR_EXPMAX= 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mii
|
||||
setf.exp FR_R= GR_EXPMAX
|
||||
(p8) mov GR_Parameter_TAG= 166
|
||||
(p6) mov GR_Parameter_TAG= 265
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow/underflow
|
||||
br.cond.sptk __libm_error_region // Branch to error handling
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_IEEE754_END(exp10)
|
||||
libm_alias_double_other (__exp10, exp10)
|
||||
// Presumably keeps the old pow10 name available as a compat symbol bound
// to exp10 for binaries linked against the listed version range -- verify
// against the compat_symbol definition in shlib-compat.h.
#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_27)
|
||||
compat_symbol (libm, exp10, pow10, GLIBC_2_2)
|
||||
#endif
|
||||
|
||||
|
||||
// __libm_error_region: common error exit for exp10.  It is entered by a
// plain branch (not a call) with GR_Parameter_TAG and f8 (FR_RESULT)
// already set, so b0 still holds exp10's own return address.
//
// The code builds a 64-byte stack frame, stores FR_Y, FR_X and FR_RESULT
// into it, calls __libm_error_support, reloads f8 from the result slot and
// returns through the saved b0.
//
// Frame layout relative to the NEW sp (derived from the adds below):
//   sp+16 : FR_X       (address in GR_Parameter_X,      r37)
//   sp+32 : FR_Y       (address in GR_Parameter_Y,      r38, at the call)
//   sp+48 : FR_RESULT  (address in GR_Parameter_RESULT, r39)
// FR_Y is f1 in this file, so the second slot is only a placeholder value.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
|
||||
.prologue
|
||||
{.mfi
|
||||
// Computed from the OLD sp (same instruction group as the sp adjustment
// below), so this is new sp + 32.
add GR_Parameter_Y= -32, sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs, GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
|
||||
}
|
||||
|
||||
{.mfi
|
||||
.fframe 64
|
||||
add sp= -64, sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP= gp // Save gp
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// Post-increment leaves GR_Parameter_Y at new sp + 48 (the result slot).
stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X= 16, sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0= b0 // Save b0
|
||||
}
|
||||
;;
|
||||
|
||||
.body
|
||||
{.mib
|
||||
stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{.mib
|
||||
stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
|
||||
// Step GR_Parameter_Y back to the FR_Y slot (new sp + 32) for the call.
add GR_Parameter_Y= -16, GR_Parameter_Y
|
||||
br.call.sptk b0= __libm_error_support# // Call error handling function
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// Recompute the result-slot address from sp rather than relying on the
// register value across the call.
add GR_Parameter_RESULT= 48, sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack
|
||||
.restore sp
|
||||
add sp= 64, sp // Restore stack pointer
|
||||
mov b0= GR_SAVE_B0 // Restore return address
|
||||
}
|
||||
;;
|
||||
|
||||
{.mib
|
||||
mov gp= GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
// External error-support routine called above.
.type __libm_error_support#, @function
|
||||
.global __libm_error_support#
|
@ -1,5 +0,0 @@
|
||||
/* IA64 does not provide the finite symbol alias: redefine
   libm_alias_finite to expand to nothing, then build the generic
   flt-32 exp10f implementation unchanged.  */
|
||||
#include <libm-alias-finite.h>
|
||||
#undef libm_alias_finite
|
||||
#define libm_alias_finite(a, b)
|
||||
#include <sysdeps/ieee754/flt-32/e_exp10f.c>
|
@ -1,814 +0,0 @@
|
||||
.file "exp10l.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2004, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 08/25/00 Initial version
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
// 05/08/03 Reformatted assembly source; corrected overflow result for round to
|
||||
// -inf and round to zero; exact results now don't set inexact flag
|
||||
// 12/16/04 Call error handling on underflow.
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// long double exp10l(long double)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x= (K + f + r)/log2(10), where
|
||||
// K is an integer, f= 0.b1 b2... b8 (f>= 0),
|
||||
// and |r|<2^{-9}
|
||||
// T is a table that stores 2^f (256 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
// D stores (2^f/T [ f ] - 1), rounded to single precision
|
||||
//
|
||||
// 10^x is approximated as
|
||||
// 2^K * T [ f ] * ((1+c1*r+c2*r^2+...+c6*r^6)*(1+c1*e)+D [ f ] ),
|
||||
// where e= log2(10)_lo*x+(log2(10)_hi*x-RN(log2(10)_hi*x))
|
||||
//
|
||||
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// exp10(0)= 1
|
||||
// exp10(+inf)= inf
|
||||
// exp10(-inf)= 0
|
||||
//
|
||||
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// f6-f15, f32-f63
|
||||
// r14-r30, r32-r40
|
||||
// p6-p8, p11-p14
|
||||
//
|
||||
|
||||
#include <shlib-compat.h>
|
||||
|
||||
|
||||
// Symbolic register names used by exp10l and its error region.
//
// Floating-point registers.
// NOTE(review): FR_X and FR_LOG10 both name f10 -- check that the argument
// is copied into FR_X before any branch to the error region.
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
FR_COEFF1 = f6
|
||||
// NOTE(review): FR_COEFF2 is assigned twice in this list (f7 here, f47
// further down).  Presumably the later assignment is the one in effect
// when the code is assembled, leaving f7 unused under this name -- confirm
// against the assembler's handling of repeated "=" assignments.
FR_COEFF2 = f7
|
||||
FR_KF0 = f9
|
||||
FR_LOG10 = f10
|
||||
FR_CONST1 = f11
|
||||
FR_XL10 = f12
|
||||
FR_COEFF3 = f13
|
||||
FR_COEFF4 = f14
|
||||
FR_UF_TEST = f15
|
||||
FR_OF_TEST = f32
|
||||
FR_L10_LOW = f33
|
||||
FR_COEFF5 = f34
|
||||
FR_COEFF6 = f35
|
||||
FR_L10 = f36
|
||||
FR_C_L10 = f37
|
||||
FR_XL10_H = f38
|
||||
FR_XL10_L = f39
|
||||
FR_KF = f40
|
||||
FR_E = f41
|
||||
FR_T = f42
|
||||
FR_D = f43
|
||||
FR_EXP_M_63 = f44
|
||||
FR_R = f45
|
||||
FR_E1 = f46
|
||||
// Second assignment of FR_COEFF2 (see the note at the first one).
FR_COEFF2 = f47
|
||||
FR_P34 = f48
|
||||
FR_P56 = f49
|
||||
FR_R2 = f50
|
||||
FR_RE = f51
|
||||
FR_D1 = f52
|
||||
FR_P36 = f53
|
||||
FR_R3E = f54
|
||||
FR_P1 = f55
|
||||
FR_P = f56
|
||||
FR_T1 = f57
|
||||
FR_XINT = f58
|
||||
FR_XINTF = f59
|
||||
FR_4 = f60
|
||||
FR_28 = f61
|
||||
FR_32 = f62
|
||||
FR_SNORM_LIMIT = f63
|
||||
|
||||
|
||||
// General registers: r14-r30 are scratch names, r33-r36 save slots and
// r37-r40 the error-support parameter registers.
GR_ADDR0 = r14
|
||||
GR_D_ADDR = r15
|
||||
GR_ADDR = r16
|
||||
GR_B63 = r17
|
||||
GR_KBITS = r18
|
||||
GR_F = r19
|
||||
GR_K = r20
|
||||
GR_D = r21
|
||||
GR_BM63 = r22
|
||||
GR_T = r23
|
||||
GR_CONST1 = r24
|
||||
GR_EMIN = r25
|
||||
GR_CONST2 = r26
|
||||
GR_BM8 = r27
|
||||
GR_SREG = r28
|
||||
GR_4_BIAS = r29
|
||||
GR_32_BIAS = r30
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT= r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
// Constants for exp10l, read front to back through GR_ADDR0 with a
// post-increment of 16 after every load:
//   offset   0..63  : four 16-byte extended values (ldfe)
//   offset  64..95  : C_3..C_6 as two pairs of 8-byte values (ldfpd)
//   offset  96..127 : two more 16-byte extended values (ldfe)
// The object is 128 bytes long.  After the last load exp10l uses GR_ADDR0
// as the base of T_table, so T_table must follow this object directly.
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0xd49a784bcd1b8afe, 0x00004008 // log2(10)*2^8
|
||||
data8 0x9a209a84fbcff798, 0x0000400b // overflow threshold
|
||||
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
|
||||
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
|
||||
data8 0x3fac6b08d704a0c0 // C_3
|
||||
data8 0x3f83b2ab6fba4e77 // C_4
|
||||
data8 0x3f55d87fe78a6731 // C_5
|
||||
data8 0x3f2430912f86c787 // C_6
|
||||
data8 0x9257edfe9b5fb698, 0x00003fbf // log2(10)_low (bits 64...127)
|
||||
data8 0x9a1bc98027a81918, 0x0000c00b // Smallest normal threshold
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
// 2^{0.b1 b2 b3 b4 b5 b6 b7 b8}
|
||||
data8 0x8000000000000000, 0x8058d7d2d5e5f6b1
|
||||
data8 0x80b1ed4fd999ab6c, 0x810b40a1d81406d4
|
||||
data8 0x8164d1f3bc030773, 0x81bea1708dde6056
|
||||
data8 0x8218af4373fc25ec, 0x8272fb97b2a5894c
|
||||
data8 0x82cd8698ac2ba1d7, 0x83285071e0fc4547
|
||||
data8 0x8383594eefb6ee37, 0x83dea15b9541b132
|
||||
data8 0x843a28c3acde4046, 0x8495efb3303efd30
|
||||
data8 0x84f1f656379c1a29, 0x854e3cd8f9c8c95d
|
||||
data8 0x85aac367cc487b15, 0x86078a2f23642a9f
|
||||
data8 0x8664915b923fba04, 0x86c1d919caef5c88
|
||||
data8 0x871f61969e8d1010, 0x877d2afefd4e256c
|
||||
data8 0x87db357ff698d792, 0x88398146b919f1d4
|
||||
data8 0x88980e8092da8527, 0x88f6dd5af155ac6b
|
||||
data8 0x8955ee03618e5fdd, 0x89b540a7902557a4
|
||||
data8 0x8a14d575496efd9a, 0x8a74ac9a79896e47
|
||||
data8 0x8ad4c6452c728924, 0x8b3522a38e1e1032
|
||||
data8 0x8b95c1e3ea8bd6e7, 0x8bf6a434adde0085
|
||||
data8 0x8c57c9c4646f4dde, 0x8cb932c1bae97a95
|
||||
data8 0x8d1adf5b7e5ba9e6, 0x8d7ccfc09c50e2f8
|
||||
data8 0x8ddf042022e69cd6, 0x8e417ca940e35a01
|
||||
data8 0x8ea4398b45cd53c0, 0x8f073af5a2013520
|
||||
data8 0x8f6a8117e6c8e5c4, 0x8fce0c21c6726481
|
||||
data8 0x9031dc431466b1dc, 0x9095f1abc540ca6b
|
||||
data8 0x90fa4c8beee4b12b, 0x915eed13c89689d3
|
||||
data8 0x91c3d373ab11c336, 0x9228ffdc10a051ad
|
||||
data8 0x928e727d9531f9ac, 0x92f42b88f673aa7c
|
||||
data8 0x935a2b2f13e6e92c, 0x93c071a0eef94bc1
|
||||
data8 0x9426ff0fab1c04b6, 0x948dd3ac8ddb7ed3
|
||||
data8 0x94f4efa8fef70961, 0x955c5336887894d5
|
||||
data8 0x95c3fe86d6cc7fef, 0x962bf1cbb8d97560
|
||||
data8 0x96942d3720185a00, 0x96fcb0fb20ac4ba3
|
||||
data8 0x97657d49f17ab08e, 0x97ce9255ec4357ab
|
||||
data8 0x9837f0518db8a96f, 0x98a1976f7597e996
|
||||
data8 0x990b87e266c189aa, 0x9975c1dd47518c77
|
||||
data8 0x99e0459320b7fa65, 0x9a4b13371fd166ca
|
||||
data8 0x9ab62afc94ff864a, 0x9b218d16f441d63d
|
||||
data8 0x9b8d39b9d54e5539, 0x9bf93118f3aa4cc1
|
||||
data8 0x9c6573682ec32c2d, 0x9cd200db8a0774cb
|
||||
data8 0x9d3ed9a72cffb751, 0x9dabfdff6367a2aa
|
||||
data8 0x9e196e189d472420, 0x9e872a276f0b98ff
|
||||
data8 0x9ef5326091a111ae, 0x9f6386f8e28ba651
|
||||
data8 0x9fd228256400dd06, 0xa041161b3d0121be
|
||||
data8 0xa0b0510fb9714fc2, 0xa11fd9384a344cf7
|
||||
data8 0xa18faeca8544b6e4, 0xa1ffd1fc25cea188
|
||||
data8 0xa27043030c496819, 0xa2e102153e918f9e
|
||||
data8 0xa3520f68e802bb93, 0xa3c36b345991b47c
|
||||
data8 0xa43515ae09e6809e, 0xa4a70f0c95768ec5
|
||||
data8 0xa5195786be9ef339, 0xa58bef536dbeb6ee
|
||||
data8 0xa5fed6a9b15138ea, 0xa6720dc0be08a20c
|
||||
data8 0xa6e594cfeee86b1e, 0xa7596c0ec55ff55b
|
||||
data8 0xa7cd93b4e965356a, 0xa8420bfa298f70d1
|
||||
data8 0xa8b6d5167b320e09, 0xa92bef41fa77771b
|
||||
data8 0xa9a15ab4ea7c0ef8, 0xaa1717a7b5693979
|
||||
data8 0xaa8d2652ec907629, 0xab0386ef48868de1
|
||||
data8 0xab7a39b5a93ed337, 0xabf13edf162675e9
|
||||
data8 0xac6896a4be3fe929, 0xace0413ff83e5d04
|
||||
data8 0xad583eea42a14ac6, 0xadd08fdd43d01491
|
||||
data8 0xae493452ca35b80e, 0xaec22c84cc5c9465
|
||||
data8 0xaf3b78ad690a4375, 0xafb51906e75b8661
|
||||
data8 0xb02f0dcbb6e04584, 0xb0a957366fb7a3c9
|
||||
data8 0xb123f581d2ac2590, 0xb19ee8e8c94feb09
|
||||
data8 0xb21a31a66618fe3b, 0xb295cff5e47db4a4
|
||||
data8 0xb311c412a9112489, 0xb38e0e38419fae18
|
||||
data8 0xb40aaea2654b9841, 0xb487a58cf4a9c180
|
||||
data8 0xb504f333f9de6484, 0xb58297d3a8b9f0d2
|
||||
data8 0xb60093a85ed5f76c, 0xb67ee6eea3b22b8f
|
||||
data8 0xb6fd91e328d17791, 0xb77c94c2c9d725e9
|
||||
data8 0xb7fbefca8ca41e7c, 0xb87ba337a1743834
|
||||
data8 0xb8fbaf4762fb9ee9, 0xb97c143756844dbf
|
||||
data8 0xb9fcd2452c0b9deb, 0xba7de9aebe5fea09
|
||||
data8 0xbaff5ab2133e45fb, 0xbb81258d5b704b6f
|
||||
data8 0xbc034a7ef2e9fb0d, 0xbc85c9c560e7b269
|
||||
data8 0xbd08a39f580c36bf, 0xbd8bd84bb67ed483
|
||||
data8 0xbe0f6809860993e2, 0xbe935317fc378238
|
||||
data8 0xbf1799b67a731083, 0xbf9c3c248e2486f8
|
||||
data8 0xc0213aa1f0d08db0, 0xc0a6956e8836ca8d
|
||||
data8 0xc12c4cca66709456, 0xc1b260f5ca0fbb33
|
||||
data8 0xc238d2311e3d6673, 0xc2bfa0bcfad907c9
|
||||
data8 0xc346ccda24976407, 0xc3ce56c98d21b15d
|
||||
data8 0xc4563ecc5334cb33, 0xc4de8523c2c07baa
|
||||
data8 0xc5672a115506dadd, 0xc5f02dd6b0bbc3d9
|
||||
data8 0xc67990b5aa245f79, 0xc70352f04336c51e
|
||||
data8 0xc78d74c8abb9b15d, 0xc817f681416452b2
|
||||
data8 0xc8a2d85c8ffe2c45, 0xc92e1a9d517f0ecc
|
||||
data8 0xc9b9bd866e2f27a3, 0xca45c15afcc72624
|
||||
data8 0xcad2265e4290774e, 0xcb5eecd3b38597c9
|
||||
data8 0xcbec14fef2727c5d, 0xcc799f23d11510e5
|
||||
data8 0xcd078b86503dcdd2, 0xcd95da6a9ff06445
|
||||
data8 0xce248c151f8480e4, 0xceb3a0ca5dc6a55d
|
||||
data8 0xcf4318cf191918c1, 0xcfd2f4683f94eeb5
|
||||
data8 0xd06333daef2b2595, 0xd0f3d76c75c5db8d
|
||||
data8 0xd184df6251699ac6, 0xd2164c023056bcab
|
||||
data8 0xd2a81d91f12ae45a, 0xd33a5457a3029054
|
||||
data8 0xd3ccf099859ac379, 0xd45ff29e0972c561
|
||||
data8 0xd4f35aabcfedfa1f, 0xd5872909ab75d18a
|
||||
data8 0xd61b5dfe9f9bce07, 0xd6aff9d1e13ba2fe
|
||||
data8 0xd744fccad69d6af4, 0xd7da67311797f56a
|
||||
data8 0xd870394c6db32c84, 0xd9067364d44a929c
|
||||
data8 0xd99d15c278afd7b6, 0xda3420adba4d8704
|
||||
data8 0xdacb946f2ac9cc72, 0xdb63714f8e295255
|
||||
data8 0xdbfbb797daf23755, 0xdc9467913a4f1c92
|
||||
data8 0xdd2d818508324c20, 0xddc705bcd378f7f0
|
||||
data8 0xde60f4825e0e9124, 0xdefb4e1f9d1037f2
|
||||
data8 0xdf9612deb8f04420, 0xe031430a0d99e627
|
||||
data8 0xe0ccdeec2a94e111, 0xe168e6cfd3295d23
|
||||
data8 0xe2055afffe83d369, 0xe2a23bc7d7d91226
|
||||
data8 0xe33f8972be8a5a51, 0xe3dd444c46499619
|
||||
data8 0xe47b6ca0373da88d, 0xe51a02ba8e26d681
|
||||
data8 0xe5b906e77c8348a8, 0xe658797368b3a717
|
||||
data8 0xe6f85aaaee1fce22, 0xe798aadadd5b9cbf
|
||||
data8 0xe8396a503c4bdc68, 0xe8da9958464b42ab
|
||||
data8 0xe97c38406c4f8c57, 0xea1e4756550eb27b
|
||||
data8 0xeac0c6e7dd24392f, 0xeb63b74317369840
|
||||
data8 0xec0718b64c1cbddc, 0xecaaeb8ffb03ab41
|
||||
data8 0xed4f301ed9942b84, 0xedf3e6b1d418a491
|
||||
data8 0xee990f980da3025b, 0xef3eab20e032bc6b
|
||||
data8 0xefe4b99bdcdaf5cb, 0xf08b3b58cbe8b76a
|
||||
data8 0xf13230a7ad094509, 0xf1d999d8b7708cc1
|
||||
data8 0xf281773c59ffb13a, 0xf329c9233b6bae9c
|
||||
data8 0xf3d28fde3a641a5b, 0xf47bcbbe6db9fddf
|
||||
data8 0xf5257d152486cc2c, 0xf5cfa433e6537290
|
||||
data8 0xf67a416c733f846e, 0xf7255510c4288239
|
||||
data8 0xf7d0df730ad13bb9, 0xf87ce0e5b2094d9c
|
||||
data8 0xf92959bb5dd4ba74, 0xf9d64a46eb939f35
|
||||
data8 0xfa83b2db722a033a, 0xfb3193cc4227c3f4
|
||||
data8 0xfbdfed6ce5f09c49, 0xfc8ec01121e447bb
|
||||
data8 0xfd3e0c0cf486c175, 0xfdedd1b496a89f35
|
||||
data8 0xfe9e115c7b8f884c, 0xff4ecb59511ec8a5
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(D_table)
|
||||
data4 0x00000000, 0x9f55c08f, 0x1e93ffa3, 0x1dcd43a8
|
||||
data4 0x1f751f79, 0x9f3cdd88, 0x9f43d155, 0x1eda222c
|
||||
data4 0x1ef35513, 0x9f597895, 0x9e698881, 0x1ec71073
|
||||
data4 0x1e50e371, 0x9dc01e19, 0x1de74133, 0x1e2f028c
|
||||
data4 0x9edefb47, 0x1ebbac48, 0x9e8b0330, 0x9e9e9314
|
||||
data4 0x1edc1d11, 0x1f098529, 0x9f52827c, 0x1f50050d
|
||||
data4 0x1f301e8e, 0x1f5b64d1, 0x9f45e3ee, 0x9ef64d6d
|
||||
data4 0x1d6ec5e8, 0x9e61ad9a, 0x1d44ccbb, 0x9e4a8bbb
|
||||
data4 0x9cf11576, 0x9dcce7e7, 0x9d02ac90, 0x1f26ccf0
|
||||
data4 0x9f0877c6, 0x9ddd62ae, 0x9f4b7fc3, 0x1ea8ef6b
|
||||
data4 0x1ea4378d, 0x1ef6fc38, 0x1db99fd9, 0x1f22bf6f
|
||||
data4 0x1f53e172, 0x1e85504a, 0x9f37cc75, 0x1f0c5e17
|
||||
data4 0x1dde8aac, 0x9cb42bb2, 0x1e153cd7, 0x1eb62bba
|
||||
data4 0x9e9b941b, 0x9ea80e3c, 0x1f508823, 0x1ec3fd36
|
||||
data4 0x1e9ffaa1, 0x1e21e2eb, 0x9d948b1d, 0x9e8ac93a
|
||||
data4 0x1ef7ee6f, 0x9e80dda3, 0x1f0814be, 0x1dc5ddfe
|
||||
data4 0x1eedb9d1, 0x9f2aaa26, 0x9ea5b0fc, 0x1edf702e
|
||||
data4 0x9e391201, 0x1f1316bb, 0x1ea27fb7, 0x9e05ed18
|
||||
data4 0x9f199ed2, 0x1ee7fd7c, 0x1f003db6, 0x9eac3793
|
||||
data4 0x9e5b8c10, 0x9f3af17c, 0x1bc9a8be, 0x1ee3c004
|
||||
data4 0x9f19b1b2, 0x9f242ce9, 0x9ce67dd1, 0x9e4f6275
|
||||
data4 0x1e20742c, 0x1eb9328a, 0x9f477153, 0x1d969718
|
||||
data4 0x9f1e6c43, 0x1f2f67f4, 0x9f39c7e4, 0x9e3c4feb
|
||||
data4 0x1da3956b, 0x9e7c685d, 0x1f280911, 0x9f0d8afb
|
||||
data4 0x1e314b40, 0x9eb4f250, 0x9f1a34ad, 0x1ef5d5e7
|
||||
data4 0x9f145496, 0x1e604827, 0x9f1e5195, 0x1e9c1fc0
|
||||
data4 0x1efde521, 0x1e69b385, 0x1f316830, 0x9f244eae
|
||||
data4 0x1f1787ec, 0x9e939971, 0x1f0bb393, 0x9f0511d6
|
||||
data4 0x1ed919de, 0x1d8b7b28, 0x1e5ca4a9, 0x1e7c357b
|
||||
data4 0x9e3ff8e8, 0x1eef53b5, 0x9ed22ed7, 0x1f16659b
|
||||
data4 0x9f2db102, 0x9e2c6a78, 0x1f328d7d, 0x9f2fec3c
|
||||
data4 0x1eb395bd, 0x9f242b84, 0x9e2683e6, 0x1ed71e68
|
||||
data4 0x1efd1df5, 0x9e9eeafd, 0x9ed2249c, 0x1eef129a
|
||||
data4 0x1d1ea44c, 0x9e81f7ff, 0x1eaf77c9, 0x9ee7a285
|
||||
data4 0x1e1864ed, 0x9ee7edbb, 0x9e15a27d, 0x9ae61655
|
||||
data4 0x1f1ff1a2, 0x1da29755, 0x9e5f46fb, 0x1e901236
|
||||
data4 0x9eecfb9b, 0x9f204d2f, 0x1ec64685, 0x9eb809bd
|
||||
data4 0x9e0026c5, 0x1d9f1da1, 0x1f142b49, 0x9f20f22e
|
||||
data4 0x1f24b067, 0x1f185a4c, 0x9f09765c, 0x9ece902f
|
||||
data4 0x1e2ca5db, 0x1e6de464, 0x9f071f67, 0x1f1518c3
|
||||
data4 0x1ea13ded, 0x1f0b8414, 0x1edb6ad4, 0x9e548740
|
||||
data4 0x9ea10efb, 0x1ee48a60, 0x1e7954c5, 0x9edad013
|
||||
data4 0x9f21517d, 0x9e9b6e0c, 0x9ee7f9a6, 0x9ebd4298
|
||||
data4 0x9d65b24e, 0x1eed751f, 0x9f1573ea, 0x9d430377
|
||||
data4 0x9e13fc0c, 0x1e47008a, 0x1e3d5c1d, 0x1ef41a91
|
||||
data4 0x9e4a4ef7, 0x9e952f18, 0x1d620566, 0x1d9b8d33
|
||||
data4 0x1db06247, 0x1e94b31e, 0x1f0730ad, 0x9d79ffb4
|
||||
data4 0x1ed64d51, 0x9e91fd11, 0x9e28d35a, 0x9dea0ed9
|
||||
data4 0x1e891def, 0x9ee28ac0, 0x1e1db99b, 0x9ee1ce38
|
||||
data4 0x9bdd9bca, 0x1eb72cb9, 0x9e8c53c6, 0x1e0df6ca
|
||||
data4 0x1e8f2ccd, 0x9e9b0886, 0x1eeb3bc7, 0x1ec7e772
|
||||
data4 0x9e210776, 0x9daf246c, 0x1ea1f151, 0x1ece4dc6
|
||||
data4 0x1ce741c8, 0x1ed3c88f, 0x9ec9a4fd, 0x9e0c8d30
|
||||
data4 0x1d2fbb26, 0x9ef212a7, 0x1ee44f1c, 0x9e445550
|
||||
data4 0x1e075f77, 0x9d9291a3, 0x1f09c2ee, 0x9e012c88
|
||||
data4 0x1f057d62, 0x9e7bb0dc, 0x9d8758ee, 0x1ee8d6c1
|
||||
data4 0x9e509a57, 0x9e4ca7b7, 0x1e2cb341, 0x9ec35106
|
||||
data4 0x1ecf3baf, 0x1e11781c, 0x1ea0cc78, 0x1eb75ca6
|
||||
data4 0x1e961e1a, 0x1eb88853, 0x1e7abf50, 0x1ee38704
|
||||
data4 0x9dc5ab0f, 0x1afe197b, 0x9ec07523, 0x9d9b7f78
|
||||
data4 0x1f011618, 0x1ed43b0b, 0x9f035945, 0x9e3fd014
|
||||
data4 0x9bbda5cd, 0x9e83f8ab, 0x1e58a928, 0x1e392d61
|
||||
data4 0x1efdbb52, 0x1ee310a8, 0x9ec7ecc1, 0x1e8c9ed6
|
||||
data4 0x9ef82dee, 0x9e70545b, 0x9ea53fc4, 0x1e40f419
|
||||
LOCAL_OBJECT_END(D_table)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_IEEE754_ENTRY(exp10l)
|
||||
|
||||
{.mfi
|
||||
alloc GR_SREG = ar.pfs, 1, 4, 4, 0
|
||||
// will continue only for normal/denormal numbers
|
||||
fclass.nm.unc p12, p7 = f8, 0x1b
|
||||
// GR_ADDR0 = pointer to log2(10), C_1...C_6 followed by T_table
|
||||
addl GR_ADDR0 = @ltoff(poly_coeffs), gp ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load start address for C_1...C_6 followed by T_table
|
||||
ld8 GR_ADDR0 = [ GR_ADDR0 ]
|
||||
// X<0 ?
|
||||
fcmp.lt.s1 p6, p8 = f8, f0
|
||||
// GR_BM8 = bias-8
|
||||
mov GR_BM8 = 0xffff-8
|
||||
}
|
||||
{.mlx
|
||||
nop.m 0
|
||||
// GR_EMIN = (-2^14-62)*2^{8}
|
||||
movl GR_EMIN = 0xca807c00 ;;
|
||||
}
|
||||
|
||||
{.mmb
|
||||
// FR_CONST1 = 2^{-8}
|
||||
setf.exp FR_CONST1 = GR_BM8
|
||||
// load log2(10)*2^8
|
||||
ldfe FR_LOG10 = [ GR_ADDR0 ], 16
|
||||
(p12) br.cond.spnt SPECIAL_EXP10 ;;
|
||||
}
|
||||
|
||||
{.mmf
|
||||
setf.s FR_UF_TEST = GR_EMIN
|
||||
// load overflow threshold
|
||||
ldfe FR_OF_TEST = [ GR_ADDR0 ], 16
|
||||
// normalize x
|
||||
fma.s0 f8 = f8, f1, f0 ;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// load C_1
|
||||
ldfe FR_COEFF1 = [ GR_ADDR0 ], 16 ;;
|
||||
// load C_2
|
||||
ldfe FR_COEFF2 = [ GR_ADDR0 ], 16
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mmf
|
||||
// GR_D_ADDR = pointer to D table
|
||||
add GR_D_ADDR = 2048-64+96+32, GR_ADDR0
|
||||
// load C_3, C_4
|
||||
ldfpd FR_COEFF3, FR_COEFF4 = [ GR_ADDR0 ], 16
|
||||
// y = x*log2(10)*2^8
|
||||
fma.s1 FR_XL10 = f8, FR_LOG10, f0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load C_5, C_6
|
||||
ldfpd FR_COEFF5, FR_COEFF6 = [ GR_ADDR0 ], 16
|
||||
// get int(x)
|
||||
fcvt.fx.trunc.s1 FR_XINT = f8
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// FR_L10 = log2(10) = (log2(10)*2^8)*2^{-8}
|
||||
fma.s1 FR_L10 = FR_LOG10, FR_CONST1, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load log2(10)_low
|
||||
ldfe FR_L10_LOW = [ GR_ADDR0 ], 16
|
||||
// y0 = x*log2(10) = x*log2(10)_hi
|
||||
fma.s1 FR_LOG10 = f8, FR_L10, f0
|
||||
mov GR_EMIN = 0xffff-63
|
||||
}
|
||||
{.mfi
|
||||
mov GR_32_BIAS = 0xffff + 5
|
||||
// (K+f)*2^8 = round_to_int(y)
|
||||
fcvt.fx.s1 FR_KF0 = FR_XL10
|
||||
mov GR_4_BIAS = 0xffff + 2;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load smallest normal limit
|
||||
ldfe FR_SNORM_LIMIT = [ GR_ADDR0 ], 16
|
||||
// x>overflow threshold ?
|
||||
fcmp.gt.s1 p12, p7 = f8, FR_OF_TEST
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
setf.exp FR_32 = GR_32_BIAS
|
||||
// x<underflow threshold ?
|
||||
(p7) fcmp.lt.s1 p12, p7 = FR_XL10, FR_UF_TEST
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
setf.exp FR_4 = GR_4_BIAS
|
||||
fcvt.xf FR_XINTF = FR_XINT
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// FR_L10 = log2(10)_h*x-RN(log2(10)_h*x)
|
||||
fms.s1 FR_L10 = f8, FR_L10, FR_LOG10
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
getf.sig GR_BM8 = FR_KF0
|
||||
fcvt.xf FR_KF0 = FR_KF0
|
||||
mov GR_CONST2 = 255 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// GR_CONST2 = f
|
||||
and GR_CONST2 = GR_CONST2, GR_BM8
|
||||
// FR_L10_LOW = e = log2(10)_l*x+(log2(10)_h*x-RN(log2(10)_h*x))
|
||||
fma.s1 FR_L10_LOW = FR_L10_LOW, f8, FR_L10
|
||||
// GR_BM8 = K
|
||||
shr GR_BM8 = GR_BM8, 8 ;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// address of D
|
||||
shladd GR_D_ADDR = GR_CONST2, 2, GR_D_ADDR
|
||||
// K += bias-63
|
||||
add GR_BM8 = GR_BM8, GR_EMIN
|
||||
// address of T
|
||||
shladd GR_ADDR0 = GR_CONST2, 3, GR_ADDR0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
// load D
|
||||
ldfs FR_OF_TEST = [ GR_D_ADDR ]
|
||||
// is input an integer ?
|
||||
fcmp.eq.s1 p13, p14 = f8, FR_XINTF
|
||||
(p12) br.cond.spnt OUT_RANGE_EXP10 ;;
|
||||
}
|
||||
|
||||
{.mmf
|
||||
// load T
|
||||
ldf8 FR_UF_TEST = [ GR_ADDR0 ]
|
||||
// FR_XL10 = 2^{K-63}
|
||||
setf.exp FR_XL10 = GR_BM8
|
||||
// r = x*log2(10)_hi-2^{-8}* [ (K+f)*2^{8} ]
|
||||
fnma.s1 FR_KF0 = FR_KF0, FR_CONST1, FR_LOG10 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// get 28.0
|
||||
fms.s1 FR_28 = FR_32, f1, FR_4
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// E = 1+C_1*e
|
||||
fma.s1 FR_L10 = FR_L10_LOW, FR_COEFF1, f1
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P12 = C_1+C_2*r
|
||||
fma.s1 FR_COEFF2 = FR_COEFF2, FR_KF0, FR_COEFF1
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P34 = C_3+C_4*r
|
||||
fma.s1 FR_COEFF4 = FR_COEFF4, FR_KF0, FR_COEFF3
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P56 = C_5+C_6*r
|
||||
fma.s1 FR_COEFF5 = FR_COEFF6, FR_KF0, FR_COEFF5
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// FR_COEFF3 = r2 = r*r
|
||||
fma.s1 FR_COEFF3 = FR_KF0, FR_KF0, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if input is integer, is it positive ?
|
||||
(p13) fcmp.ge.s1 p13, p14 = f8, f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r' = r*E
|
||||
fma.s1 FR_KF0 = FR_KF0, FR_L10, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// D' = D+C_1*e
|
||||
fma.s1 FR_OF_TEST = FR_L10_LOW, FR_COEFF1, FR_OF_TEST
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// test if x >= smallest normal limit
|
||||
fcmp.ge.s1 p11, p0 = f8, FR_SNORM_LIMIT
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P36 = P34+r2*P56
|
||||
fma.s1 FR_COEFF4 = FR_COEFF5, FR_COEFF3, FR_COEFF4
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// FR_COEFF3 = r3 = r'*r2
|
||||
fma.s1 FR_COEFF3 = FR_COEFF3, FR_KF0, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// is input below 28.0 ?
|
||||
(p13) fcmp.lt.s1 p13, p14 = f8, FR_28
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P' = P12*r'+D'
|
||||
fma.s1 FR_COEFF2 = FR_COEFF2, FR_KF0, FR_OF_TEST
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P = P'+r3*P36
|
||||
fma.s1 FR_COEFF3 = FR_COEFF3, FR_COEFF4, FR_COEFF2
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// T = 2^{K-63}*T
|
||||
fma.s1 FR_UF_TEST = FR_UF_TEST, FR_XL10, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p13,p14
|
||||
{.mfi
|
||||
nop.m 0
|
||||
(p13) fma.s1 f8 = FR_COEFF3, FR_UF_TEST, FR_UF_TEST
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result = T+T*P
|
||||
(p14) fma.s0 f8 = FR_COEFF3, FR_UF_TEST, FR_UF_TEST
|
||||
// return
|
||||
(p11) br.ret.sptk b0 ;; // return, if result normal
|
||||
}
|
||||
|
||||
// Here if result in denormal range (and not zero)
|
||||
{.mib
|
||||
nop.m 0
|
||||
mov GR_Parameter_TAG= 264
|
||||
br.cond.sptk __libm_error_region // Branch to error handling
|
||||
}
|
||||
;;
|
||||
|
||||
SPECIAL_EXP10:
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x = -Infinity ?
|
||||
fclass.m p6, p0 = f8, 0x22
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x = +Infinity ?
|
||||
fclass.m p7, p0 = f8, 0x21
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x = +/-Zero ?
|
||||
fclass.m p8, p0 = f8, 0x7
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// exp10(-Infinity) = 0
|
||||
(p6) mov f8 = f0
|
||||
(p6) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// exp10(+Infinity) = +Infinity
|
||||
nop.f 0
|
||||
(p7) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// exp10(+/-0) = 1
|
||||
(p8) mov f8 = f1
|
||||
(p8) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// Remaining cases: NaNs
|
||||
fma.s0 f8 = f8, f1, f0
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
|
||||
OUT_RANGE_EXP10:
|
||||
|
||||
// underflow: p6 = 1
|
||||
// overflow: p8 = 1
|
||||
|
||||
.pred.rel "mutex",p6,p8
|
||||
{.mmi
|
||||
(p8) mov GR_CONST1 = 0x1fffe
|
||||
(p6) mov GR_CONST1 = 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mii
|
||||
setf.exp FR_KF0 = GR_CONST1
|
||||
(p8) mov GR_Parameter_TAG = 165
|
||||
(p6) mov GR_Parameter_TAG = 264
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 999
|
||||
fma.s0 f8 = FR_KF0, FR_KF0, f0 // Create overflow/underflow
|
||||
br.cond.sptk __libm_error_region // Branch to error handling
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_IEEE754_END(exp10l)
|
||||
libm_alias_ldouble_other (__exp10, exp10)
|
||||
#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_27)
|
||||
compat_symbol (libm, exp10l, pow10l, GLIBC_2_2)
|
||||
#endif
|
||||
|
||||
|
||||
// __libm_error_region: common error exit reached from exp10l with
// GR_Parameter_TAG already set by the branching code.
// It creates a 64-byte stack frame, stores FR_Y, FR_X and FR_RESULT into
// it, leaves the addresses of those slots in GR_Parameter_Y,
// GR_Parameter_X and GR_Parameter_RESULT, and calls __libm_error_support.
// On return it reloads f8 from the result slot, restores sp, b0, gp and
// ar.pfs, and returns to exp10l's caller.
// Frame layout relative to the new sp: +16 = X, +32 = Y, +48 = RESULT.
// NOTE(review): FR_X is f10, which exp10l also names FR_LOG10 and
// overwrites; exp10l does not copy its input into FR_X, so the value
// stored as "Parameter 1" is presumably not the original x -- confirm
// against how __libm_error_support uses that slot.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{.mfi
|
||||
add GR_Parameter_Y = -32, sp // Address of Parameter 2 slot (old sp - 32 = new sp + 32)
|
||||
nop.f 0
|
||||
.save ar.pfs, GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
|
||||
}
|
||||
|
||||
{.mfi
|
||||
.fframe 64
|
||||
add sp = -64, sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP = gp ;; // Save gp
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// The post-increment of 16 leaves GR_Parameter_Y at new sp + 48.
|
||||
stfe [ GR_Parameter_Y ] = FR_Y, 16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16, sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0 = b0 ;; // Save b0
|
||||
}
|
||||
|
||||
.body
|
||||
{.mib
|
||||
stfe [ GR_Parameter_X ] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{.mib
|
||||
// The store below uses GR_Parameter_Y while it still points at new sp + 48;
|
||||
// the add in the same bundle then moves it back to the Parameter 2 slot.
|
||||
stfe [ GR_Parameter_Y ] = FR_RESULT // STORE Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16, GR_Parameter_Y
|
||||
br.call.sptk b0 = __libm_error_support# ;; // Call error handling function
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// Recompute the result slot address from sp after the call.
|
||||
add GR_Parameter_RESULT = 48, sp
|
||||
nop.m 0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64, sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 ;; // Restore return address
|
||||
}
|
||||
|
||||
{.mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 ;; // Return
|
||||
}
|
||||
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
.type __libm_error_support#, @function
|
||||
.global __libm_error_support#
|
@ -1,570 +0,0 @@
|
||||
.file "exp2.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 08/25/00 Initial version
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 09/05/02 Improved performance
|
||||
// 01/17/03 Fixed to call error support when x=1024.0
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// double exp2(double)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x= (K + fh + fl + r), where
|
||||
// K is an integer, fh= 0.b1 b2 b3 b4 b5,
|
||||
// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
|
||||
// and |r|<2^{-11}
|
||||
// Th is a table that stores 2^fh (32 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
// Tl is a table that stores 2^fl (32 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
//
|
||||
// 2^x is approximated as
|
||||
// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2+c3*r^3+c4*r^4)
|
||||
|
||||
// Note: We use the following trick to speed up conversion from FP to integer:
|
||||
//
|
||||
// Let x = K + r, where K is an integer, and |r| <= 0.5
|
||||
// Let N be the number of significand bits for the FP format used
|
||||
// ( N=64 for double-extended, N=53 for double)
|
||||
//
|
||||
// Then let y = 1.5 * 2^(N-1) + x for RN mode
|
||||
// K = y - 1.5 * 2^(N-1)
|
||||
// r = x - K
|
||||
//
|
||||
// If we want to obtain the integer part and the first m fractional bits of x,
|
||||
// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m):
|
||||
//
|
||||
// Let x = K + f + r
|
||||
// f = 0.b_1 b_2 ... b_m
|
||||
// |r| <= 2^(-m-1)
|
||||
//
|
||||
// Then let y = 1.5 * 2^(N-1-m) + x for RN mode
|
||||
// (K+f) = y - 1.5 * 2^(N-1-m)
|
||||
// r = x - (K+f)
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// exp2(0)= 1
|
||||
// exp2(+inf)= inf
|
||||
// exp2(-inf)= 0
|
||||
//
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// r2-r3, r14-r40
|
||||
// f6-f15, f32-f45
|
||||
// p6-p8, p12
|
||||
//
|
||||
|
||||
|
||||
GR_TBL_START = r2
|
||||
GR_LOG_TBL = r3
|
||||
|
||||
GR_OF_LIMIT = r14
|
||||
GR_UF_LIMIT = r15
|
||||
GR_EXP_CORR = r16
|
||||
GR_F_low = r17
|
||||
GR_F_high = r18
|
||||
GR_K = r19
|
||||
GR_Flow_ADDR = r20
|
||||
|
||||
GR_BIAS = r21
|
||||
GR_Fh = r22
|
||||
GR_Fh_ADDR = r23
|
||||
GR_EXPMAX = r24
|
||||
GR_EMIN = r25
|
||||
|
||||
GR_ROUNDVAL = r26
|
||||
GR_MASK = r27
|
||||
GR_KF0 = r28
|
||||
GR_MASK_low = r29
|
||||
GR_COEFF_START = r30
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
FR_COEFF1 = f6
|
||||
FR_COEFF2 = f7
|
||||
FR_R = f9
|
||||
|
||||
FR_KF0 = f12
|
||||
FR_COEFF3 = f13
|
||||
FR_COEFF4 = f14
|
||||
FR_UF_LIMIT = f15
|
||||
|
||||
FR_OF_LIMIT = f32
|
||||
FR_EXPMIN = f33
|
||||
FR_ROUNDVAL = f34
|
||||
FR_KF = f35
|
||||
|
||||
FR_2_TO_K = f36
|
||||
FR_T_low = f37
|
||||
FR_T_high = f38
|
||||
FR_P34 = f39
|
||||
FR_R2 = f40
|
||||
|
||||
FR_P12 = f41
|
||||
FR_T_low_K = f42
|
||||
FR_P14 = f43
|
||||
FR_T = f44
|
||||
FR_P = f45
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4
|
||||
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
|
||||
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
// 2^{0.00000 b6 b7 b8 b9 b10}
|
||||
data8 0x8000000000000000, 0x8016302f17467628
|
||||
data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
|
||||
data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
|
||||
data8 0x80855ad965e88b83, 0x809ba2264dada76a
|
||||
data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
|
||||
data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
|
||||
data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
|
||||
data8 0x813801881d886f7b, 0x814e67cceb90502c
|
||||
data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
|
||||
data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
|
||||
data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
|
||||
data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
|
||||
data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
|
||||
data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
|
||||
data8 0x8272fb97b2a5894c, 0x828998760d01faf3
|
||||
data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
|
||||
//
|
||||
// 2^{0.b1 b2 b3 b4 b5}
|
||||
data8 0x8000000000000000, 0x82cd8698ac2ba1d7
|
||||
data8 0x85aac367cc487b14, 0x88980e8092da8527
|
||||
data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
|
||||
data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
|
||||
data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
|
||||
data8 0x9ef5326091a111ad, 0xa27043030c496818
|
||||
data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
|
||||
data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
|
||||
data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
|
||||
data8 0xbd08a39f580c36be, 0xc12c4cca66709456
|
||||
data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
|
||||
data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
|
||||
data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
|
||||
data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
|
||||
data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
|
||||
data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
// exp2 main path.  The argument x arrives in f8 and the double-precision
// result is left in f8.  Zero, infinity and NaN inputs branch to SPECIAL_exp2;
// arguments at/above the overflow limit or below the underflow limit branch
// to OUT_RANGE_exp2.  Otherwise x is split as K + f_high + f_low + r and
//   result = T + T*P,  T = 2^{K-126} * T_low * T_high,
//   P = r*(C_1 + C_2*r + r^2*(C_3 + C_4*r)).
WEAK_LIBM_ENTRY(exp2)
|
||||
|
||||
|
||||
{.mfi
|
||||
alloc r32= ar.pfs, 1, 4, 4, 0
|
||||
// will continue only for non-zero normal/denormal numbers
|
||||
fclass.nm p12, p0= f8, 0x1b
|
||||
// GR_TBL_START= pointer to C_1...C_4 followed by T_table
|
||||
addl GR_TBL_START= @ltoff(poly_coeffs), gp
|
||||
}
|
||||
{.mlx
|
||||
mov GR_OF_LIMIT= 0xffff + 10 // Exponent of overflow limit
|
||||
movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP)
|
||||
}
|
||||
;;
|
||||
|
||||
// Form special constant 1.5*2^(63-10) to give integer part and first 10
|
||||
// fractional bits of x
|
||||
{.mfi
|
||||
setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant
|
||||
// p6/p8 from this sign test are what OUT_RANGE_exp2 uses to choose between
// the underflow (p6) and overflow (p8) paths.
fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
|
||||
nop.f 0
|
||||
(p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero
|
||||
}
|
||||
;;
|
||||
|
||||
{.mlx
|
||||
setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
|
||||
movl GR_UF_LIMIT= 0xc4866000 // (-2^10-51) = -1075
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4
|
||||
fma.s0 f8= f8, f1, f0 // normalize x
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
|
||||
ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
|
||||
mov GR_EXP_CORR= 0xffff-126
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
// After this third post-incremented load GR_COEFF_START has advanced by 48
// bytes; the code below uses it as the base of the 2^{f_low} table.
ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
|
||||
fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10)
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
mov GR_MASK= 1023
|
||||
fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)
|
||||
mov GR_MASK_low= 31
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
|
||||
fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ?
|
||||
add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
and GR_F_low= GR_KF0, GR_MASK_low // f_low
|
||||
and GR_F_high= GR_MASK, GR_KF0 // low 10 bits: f_high*32 + f_low
|
||||
shr GR_K= GR_KF0, 10 // K
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
|
||||
add GR_BIAS= GR_K, GR_EXP_CORR // biased exponent of 2^{K-126}: K + (0xffff - 2*63)
|
||||
shr GR_Fh= GR_F_high, 5 // f_high
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
|
||||
fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f)
|
||||
shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
|
||||
}
|
||||
{.mlx
|
||||
// NOTE(review): T_table holds 8-byte significands only; the -126 (= -2*63)
// folded into FR_2_TO_K presumably cancels the 2^63 scale each ldf8 value
// carries -- confirm against the Itanium ldf8 definition.
ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
|
||||
movl GR_EMIN= 0xc47f8000 // EMIN= -1022
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
|
||||
// Only evaluated when the overflow test failed (p7); p12 ends up set for
// either out-of-range condition.
(p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN
|
||||
fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.s1 FR_R2= FR_R, FR_R, f0 // r*r
|
||||
// p6/p8 still hold the X<0 test from the top of the function here.
(p12) br.cond.spnt OUT_RANGE_exp2
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// Re-uses p6/p8: from here on p8 means "x >= EMIN, plain return" and p6
// means "x < EMIN, report through the error region".
fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P= FR_P14, FR_R, f0 // P= P14*r
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P
|
||||
(p8) br.ret.sptk b0 // return
|
||||
}
|
||||
;;
|
||||
|
||||
// Reached only when the return above was not taken (x < EMIN): hand the
// already computed result to the error region with tag 162.
{.mfb
|
||||
(p6) mov GR_Parameter_TAG= 162
|
||||
nop.f 0
|
||||
(p6) br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// SPECIAL_exp2: entered from the first branch of exp2 when x (in f8) is
// zero, infinite or NaN.  Each class is tested with fclass.m and the
// function returns directly with the result in f8:
//   -Infinity -> 0,  +Infinity -> x unchanged,  +/-0 -> 1,
//   anything else (NaN) -> x*1 + 0 rounded to double.
SPECIAL_exp2:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p6) mov f8= f0 // exp2(-Infinity)= 0
|
||||
(p6) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
// f8 is left untouched, so the input +Infinity is also the return value.
(p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p8) mov f8= f1 // exp2(+/-0)= 1
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// OUT_RANGE_exp2: entered from exp2 when x is at/above the overflow limit or
// below the underflow limit.  The predicates come from the X<0 test at the
// top of exp2: p8 selects the overflow sequence, p6 the underflow sequence.
// Each sequence builds a power of two with setf.exp, squares it in double
// precision into f8, and sets GR_Parameter_TAG (161 overflow, 162 underflow).
// NOTE(review): the block ends without a branch, so control presumably falls
// through into the __libm_error_region code that follows -- confirm.
OUT_RANGE_exp2:
|
||||
|
||||
// overflow: p8= 1
|
||||
|
||||
{.mii
|
||||
(p8) mov GR_EXPMAX= 0x1fffe
|
||||
nop.i 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmb
|
||||
(p8) mov GR_Parameter_TAG= 161
|
||||
(p8) setf.exp FR_R= GR_EXPMAX
|
||||
nop.b 999
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p8) fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow
|
||||
nop.i 999
|
||||
}
|
||||
// underflow: p6= 1
|
||||
{.mii
|
||||
(p6) mov GR_Parameter_TAG= 162
|
||||
// GR_EXPMAX is reused here to hold the smallest exponent field value, 1.
(p6) mov GR_EXPMAX= 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmb
|
||||
nop.m 0
|
||||
(p6) setf.exp FR_R= GR_EXPMAX
|
||||
nop.b 999
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 999
|
||||
(p6) fma.d.s0 f8= FR_R, FR_R, f0 // Create underflow
|
||||
nop.b 0
|
||||
}
|
||||
;;
|
||||
|
||||
WEAK_LIBM_END(exp2)
|
||||
// Symbol aliases for exp2.  In shared builds exp2@@GLIBC_2.29 is the default
// versioned symbol, and __exp2_compat (a weak alias set equal to __exp2) is
// exported as the older exp2@GLIBC_2.2.
libm_alias_double_other (__exp2, exp2)
|
||||
#ifdef SHARED
|
||||
.symver exp2,exp2@@GLIBC_2.29
|
||||
.weak __exp2_compat
|
||||
.set __exp2_compat,__exp2
|
||||
.symver __exp2_compat,exp2@GLIBC_2.2
|
||||
#endif
|
||||
|
||||
|
||||
// __libm_error_region: error path for exp2.  It saves ar.pfs, gp and b0,
// opens a 64-byte stack frame, stores FR_Y, FR_X and FR_RESULT as doubles,
// and calls __libm_error_support.  On return it reloads f8 from the result
// slot, restores sp, b0, gp and ar.pfs, and returns to exp2's caller.
// Slot layout relative to the new sp, as set up below:
//   sp+16 = X (GR_Parameter_X), sp+32 = Y (GR_Parameter_Y),
//   sp+48 = RESULT (GR_Parameter_RESULT).
// GR_Parameter_TAG is expected to have been set by the code that got here.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
|
||||
.prologue
|
||||
{.mfi
|
||||
// Computed from the caller's sp (no stop before the sp adjustment below),
// i.e. new sp + 32.
add GR_Parameter_Y= -32, sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs, GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
|
||||
}
|
||||
|
||||
{.mfi
|
||||
.fframe 64
|
||||
add sp= -64, sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP= gp // Save gp
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// Post-increment leaves GR_Parameter_Y at sp+48, the result slot.
stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X= 16, sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0= b0 // Save b0
|
||||
}
|
||||
;;
|
||||
|
||||
.body
|
||||
{.mib
|
||||
stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{.mib
|
||||
stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
|
||||
// Step GR_Parameter_Y back to the Parameter 2 slot before the call.
add GR_Parameter_Y= -16, GR_Parameter_Y
|
||||
br.call.sptk b0= __libm_error_support# // Call error handling function
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// Recompute the result address from sp rather than relying on the register
// value from before the call.
add GR_Parameter_RESULT= 48, sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack
|
||||
.restore sp
|
||||
add sp= 64, sp // Restore stack pointer
|
||||
mov b0= GR_SAVE_B0 // Restore return address
|
||||
}
|
||||
;;
|
||||
|
||||
{.mib
|
||||
mov gp= GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#, @function
|
||||
.global __libm_error_support#
|
@ -1,545 +0,0 @@
|
||||
.file "exp2f.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 08/25/00 Initial version
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 09/05/02 Improved performance and accuracy
|
||||
// 01/17/03 Fixed to call error support when x=128.0
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// float exp2f(float)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x= (K + fh + fl + r), where
|
||||
// K is an integer, fh= 0.b1 b2 b3 b4 b5,
|
||||
// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
|
||||
// and |r|<2^{-11}
|
||||
// Th is a table that stores 2^fh (32 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
// Tl is a table that stores 2^fl (32 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
//
|
||||
// 2^x is approximated as
|
||||
// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2)
|
||||
|
||||
// Note: We use the following trick to speed up conversion from FP to integer:
|
||||
//
|
||||
// Let x = K + r, where K is an integer, and |r| <= 0.5
|
||||
// Let N be the number of significand bits for the FP format used
|
||||
// ( N=64 for double-extended, N=53 for double)
|
||||
//
|
||||
// Then let y = 1.5 * 2^(N-1) + x for RN mode
|
||||
// K = y - 1.5 * 2^(N-1)
|
||||
// r = x - K
|
||||
//
|
||||
// If we want to obtain the integer part and the first m fractional bits of x,
|
||||
// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m):
|
||||
//
|
||||
// Let x = K + f + r
|
||||
// f = 0.b_1 b_2 ... b_m
|
||||
// |r| <= 2^(-m-1)
|
||||
//
|
||||
// Then let y = 1.5 * 2^(N-1-m) + x for RN mode
|
||||
// (K+f) = y - 1.5 * 2^(N-1-m)
|
||||
// r = x - (K+f)
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// exp2(0)= 1
|
||||
// exp2(+inf)= inf
|
||||
// exp2(-inf)= 0
|
||||
//
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// r2-r3, r14-r40
|
||||
// f6-f15, f32-f45
|
||||
// p6-p8, p12
|
||||
//
|
||||
|
||||
|
||||
GR_TBL_START = r2
|
||||
GR_LOG_TBL = r3
|
||||
|
||||
GR_OF_LIMIT = r14
|
||||
GR_UF_LIMIT = r15
|
||||
GR_EXP_CORR = r16
|
||||
GR_F_low = r17
|
||||
GR_F_high = r18
|
||||
GR_K = r19
|
||||
GR_Flow_ADDR = r20
|
||||
|
||||
GR_BIAS = r21
|
||||
GR_Fh = r22
|
||||
GR_Fh_ADDR = r23
|
||||
GR_EXPMAX = r24
|
||||
GR_EMIN = r25
|
||||
|
||||
GR_ROUNDVAL = r26
|
||||
GR_MASK = r27
|
||||
GR_KF0 = r28
|
||||
GR_MASK_low = r29
|
||||
GR_COEFF_START = r30
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
FR_COEFF1 = f6
|
||||
FR_COEFF2 = f7
|
||||
FR_R = f9
|
||||
|
||||
FR_KF0 = f12
|
||||
FR_UF_LIMIT = f15
|
||||
|
||||
FR_OF_LIMIT = f32
|
||||
FR_EXPMIN = f33
|
||||
FR_ROUNDVAL = f34
|
||||
FR_KF = f35
|
||||
|
||||
FR_2_TO_K = f36
|
||||
FR_T_low = f37
|
||||
FR_T_high = f38
|
||||
|
||||
FR_P12 = f41
|
||||
FR_T_low_K = f42
|
||||
FR_T = f44
|
||||
FR_P = f45
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
|
||||
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
// 2^{0.00000 b6 b7 b8 b9 b10}
|
||||
data8 0x8000000000000000, 0x8016302f17467628
|
||||
data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
|
||||
data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
|
||||
data8 0x80855ad965e88b83, 0x809ba2264dada76a
|
||||
data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
|
||||
data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
|
||||
data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
|
||||
data8 0x813801881d886f7b, 0x814e67cceb90502c
|
||||
data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
|
||||
data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
|
||||
data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
|
||||
data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
|
||||
data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
|
||||
data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
|
||||
data8 0x8272fb97b2a5894c, 0x828998760d01faf3
|
||||
data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
|
||||
//
|
||||
// 2^{0.b1 b2 b3 b4 b5}
|
||||
data8 0x8000000000000000, 0x82cd8698ac2ba1d7
|
||||
data8 0x85aac367cc487b14, 0x88980e8092da8527
|
||||
data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
|
||||
data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
|
||||
data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
|
||||
data8 0x9ef5326091a111ad, 0xa27043030c496818
|
||||
data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
|
||||
data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
|
||||
data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
|
||||
data8 0xbd08a39f580c36be, 0xc12c4cca66709456
|
||||
data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
|
||||
data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
|
||||
data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
|
||||
data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
|
||||
data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
|
||||
data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
// exp2f main path.  The argument x arrives in f8 and the single-precision
// result is left in f8.  Zero, infinity and NaN inputs branch to
// SPECIAL_exp2; arguments at/above the overflow limit or below the underflow
// limit branch to OUT_RANGE_exp2.  Otherwise x is split as
// K + f_high + f_low + r and
//   result = T + T*P,  T = 2^{K-126} * T_low * T_high,
//   P = r*(C_1 + C_2*r).
WEAK_LIBM_ENTRY(exp2f)
|
||||
|
||||
|
||||
{.mfi
|
||||
alloc r32= ar.pfs, 1, 4, 4, 0
|
||||
// will continue only for non-zero normal/denormal numbers
|
||||
fclass.nm p12, p0= f8, 0x1b
|
||||
// GR_TBL_START= pointer to C_1...C_2 followed by T_table
|
||||
addl GR_TBL_START= @ltoff(poly_coeffs), gp
|
||||
}
|
||||
{.mlx
|
||||
mov GR_OF_LIMIT= 0xffff + 7 // Exponent of overflow limit
|
||||
movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP)
|
||||
}
|
||||
;;
|
||||
|
||||
// Form special constant 1.5*2^(63-10) to give integer part and first 10
|
||||
// fractional bits of x
|
||||
{.mfi
|
||||
setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant
|
||||
// p6/p8 from this sign test are what OUT_RANGE_exp2 uses to choose between
// the underflow (p6) and overflow (p8) paths.
fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
|
||||
nop.f 0
|
||||
(p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero
|
||||
}
|
||||
;;
|
||||
|
||||
{.mlx
|
||||
setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
|
||||
movl GR_UF_LIMIT= 0xc3160000 // (-2^7-22) = -150
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
|
||||
fma.s0 f8= f8, f1, f0 // normalize x
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// After this second post-incremented load GR_COEFF_START has advanced by 32
// bytes, past poly_coeffs; the code below uses it as the base of the
// 2^{f_low} table.
ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
|
||||
setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
|
||||
mov GR_EXP_CORR= 0xffff-126
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10)
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
mov GR_MASK= 1023
|
||||
fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)
|
||||
mov GR_MASK_low= 31
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
|
||||
fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ?
|
||||
// 256 bytes = the 32 eight-byte entries of the 2^{f_low} half of T_table.
add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
and GR_F_low= GR_KF0, GR_MASK_low // f_low
|
||||
and GR_F_high= GR_MASK, GR_KF0 // low 10 bits: f_high*32 + f_low
|
||||
shr GR_K= GR_KF0, 10 // K
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
|
||||
add GR_BIAS= GR_K, GR_EXP_CORR // biased exponent of 2^{K-126}: K + (0xffff - 2*63)
|
||||
shr GR_Fh= GR_F_high, 5 // f_high
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
|
||||
fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f)
|
||||
shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
|
||||
}
|
||||
{.mlx
|
||||
// NOTE(review): T_table holds 8-byte significands only; the -126 (= -2*63)
// folded into FR_2_TO_K presumably cancels the 2^63 scale each ldf8 value
// carries -- confirm against the Itanium ldf8 definition.
ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
|
||||
movl GR_EMIN= 0xc2fc0000 // EMIN= -126
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
|
||||
// Only evaluated when the overflow test failed (p7); p12 ends up set for
// either out-of-range condition.
(p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN
|
||||
fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
|
||||
// p6/p8 still hold the X<0 test from the top of the function here.
(p12) br.cond.spnt OUT_RANGE_exp2
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_P= FR_R, FR_P12, f0 // P= P12*r
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// Re-uses p6/p8: from here on p8 means "x >= EMIN, plain return" and p6
// means "x < EMIN, report through the error region".
fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.s.s0 f8= FR_P, FR_T, FR_T // result= T+T*P
|
||||
(p8) br.ret.sptk b0 // return
|
||||
}
|
||||
;;
|
||||
|
||||
// Reached only when the return above was not taken (x < EMIN): hand the
// already computed result to the error region with tag 164.
{.mfb
|
||||
(p6) mov GR_Parameter_TAG= 164
|
||||
nop.f 0
|
||||
(p6) br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// SPECIAL_exp2: entered from the first branch of exp2f when x (in f8) is
// zero, infinite or NaN.  Each class is tested with fclass.m and the
// function returns directly with the result in f8:
//   -Infinity -> 0,  +Infinity -> x unchanged,  +/-0 -> 1,
//   anything else (NaN) -> x*1 + 0 rounded to single.
SPECIAL_exp2:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p6) mov f8= f0 // exp2(-Infinity)= 0
|
||||
(p6) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
// f8 is left untouched, so the input +Infinity is also the return value.
(p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p8) mov f8= f1 // exp2(+/-0)= 1
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fma.s.s0 f8= f8, f1, f0 // Remaining cases: NaNs
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
// OUT_RANGE_exp2: entered from exp2f when x is at/above the overflow limit
// or below the underflow limit.  The predicates come from the X<0 test at
// the top of exp2f: p8 selects the overflow sequence, p6 the underflow
// sequence.  Each sequence builds a power of two with setf.exp, squares it
// in single precision into f8, and sets GR_Parameter_TAG (163 overflow,
// 164 underflow).
// NOTE(review): the block ends without a branch, so control presumably falls
// through into the __libm_error_region code that follows -- confirm.
OUT_RANGE_exp2:
|
||||
|
||||
// overflow: p8= 1
|
||||
|
||||
{.mii
|
||||
(p8) mov GR_EXPMAX= 0x1fffe
|
||||
nop.i 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmb
|
||||
(p8) mov GR_Parameter_TAG= 163
|
||||
(p8) setf.exp FR_R= GR_EXPMAX
|
||||
nop.b 999
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p8) fma.s.s0 f8= FR_R, FR_R, f0 // Create overflow
|
||||
nop.i 999
|
||||
}
|
||||
// underflow: p6= 1
|
||||
{.mii
|
||||
(p6) mov GR_Parameter_TAG= 164
|
||||
// GR_EXPMAX is reused here to hold the smallest exponent field value, 1.
(p6) mov GR_EXPMAX= 1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmb
|
||||
nop.m 0
|
||||
(p6) setf.exp FR_R= GR_EXPMAX
|
||||
nop.b 999
|
||||
}
|
||||
;;
|
||||
|
||||
{.mfb
|
||||
nop.m 999
|
||||
(p6) fma.s.s0 f8= FR_R, FR_R, f0 // Create underflow
|
||||
nop.b 0
|
||||
}
|
||||
;;
|
||||
|
||||
WEAK_LIBM_END(exp2f)
|
||||
// Symbol aliases for exp2f.  In shared builds exp2f@@GLIBC_2.27 is the
// default versioned symbol, and __exp2f_compat (a weak alias set equal to
// __exp2f) is exported as the older exp2f@GLIBC_2.2.
// NOTE(review): the macro is given the base names (__exp2, exp2); presumably
// libm_alias_float_other appends the "f" suffix itself -- confirm.
libm_alias_float_other (__exp2, exp2)
|
||||
#ifdef SHARED
|
||||
.symver exp2f,exp2f@@GLIBC_2.27
|
||||
.weak __exp2f_compat
|
||||
.set __exp2f_compat,__exp2f
|
||||
.symver __exp2f_compat,exp2f@GLIBC_2.2
|
||||
#endif
|
||||
|
||||
|
||||
// __libm_error_region: error path for exp2f.  It saves ar.pfs, gp and b0
// (in r34, r35 and r33), opens a 64-byte stack frame, stores FR_Y, FR_X and
// FR_RESULT as single-precision values, and calls __libm_error_support.
// On return it reloads f8 from the result slot, restores sp, b0, gp and
// ar.pfs, and returns to exp2f's caller.
// Slot layout relative to the new sp, as set up below:
//   sp+16 = X (GR_Parameter_X), sp+32 = Y (GR_Parameter_Y),
//   sp+48 = RESULT (GR_Parameter_RESULT).
// GR_Parameter_TAG (r40) is expected to have been set by the code that got
// here (163 or 164 in this file).
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
|
||||
.prologue
|
||||
{.mfi
|
||||
// Computed from the caller's sp (no stop before the sp adjustment below),
// i.e. new sp + 32.
add GR_Parameter_Y= -32, sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs, GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
|
||||
}
|
||||
|
||||
{.mfi
|
||||
.fframe 64
|
||||
add sp= -64, sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP= gp // Save gp
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// Post-increment leaves GR_Parameter_Y at sp+48, the result slot.
stfs [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X= 16, sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0= b0 // Save b0
|
||||
}
|
||||
;;
|
||||
|
||||
.body
|
||||
{.mib
|
||||
stfs [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{.mib
|
||||
stfs [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
|
||||
// Step GR_Parameter_Y back to the Parameter 2 slot before the call.
add GR_Parameter_Y= -16, GR_Parameter_Y
|
||||
br.call.sptk b0= __libm_error_support# // Call error handling function
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
// Recompute the result address from sp rather than relying on the register
// value from before the call.
add GR_Parameter_RESULT= 48, sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{.mmi
|
||||
ldfs f8= [ GR_Parameter_RESULT ] // Get return result off stack
|
||||
.restore sp
|
||||
add sp= 64, sp // Restore stack pointer
|
||||
mov b0= GR_SAVE_B0 // Restore return address
|
||||
}
|
||||
;;
|
||||
|
||||
{.mib
|
||||
mov gp= GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#, @function
|
||||
.global __libm_error_support#
|
@ -1,807 +0,0 @@
|
||||
.file "exp2l.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 07/27/00 Initial version
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [ the previously overwritten ] GR_Parameter_RESULT.
|
||||
// 02/02/01 Added libm_error_support calls for underflow
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/06/03 Reordered header: .section, .global, .proc, .align
|
||||
// 05/07/03 Reformatted assembly source
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// long double exp2l(long double)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x= K + f + r, where
|
||||
// K is an integer, f= 0.b1 b2... b8 (f>= 0),
|
||||
// and |r|<2^{-8}
|
||||
// T is a table that stores 2^f (256 entries) rounded to
|
||||
// double extended precision (only mantissa is stored)
|
||||
// D stores (2^f/T [ f ] - 1), rounded to single precision
|
||||
//
|
||||
// 2^x is approximated as
|
||||
// 2^K * T [ f ] * (1+D [ f ] +c1*r+c2*r^2+...+c6*r^6)
|
||||
//
|
||||
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// exp2(0)= 1
|
||||
// exp2(+inf)= inf
|
||||
// exp2(-inf)= 0
|
||||
//
|
||||
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// f6-f15, f32-f46
|
||||
// r2-r3, r8-r11, r14-r40
|
||||
// p6, p7, p8, p12
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
FR_KF0 = f6
|
||||
FR_EXP63 = f7
|
||||
FR_T = f9
|
||||
FR_COEFF3 = f10
|
||||
FR_COEFF4 = f11
|
||||
FR_COEFF5 = f12
|
||||
FR_COEFF6 = f13
|
||||
FR_COEFF1 = f14
|
||||
FR_COEFF2 = f15
|
||||
FR_2P14 = f32
|
||||
FR_UF_TEST = f33
|
||||
FR_D = f34
|
||||
FR_R = f35
|
||||
FR_2EXP = f36
|
||||
FR_EMIN = f37
|
||||
FR_P34 = f38
|
||||
FR_P56 = f39
|
||||
FR_R2 = f40
|
||||
FR_P12 = f41
|
||||
FR_TS = f42
|
||||
FR_P36 = f43
|
||||
FR_P02 = f44
|
||||
FR_R3 = f45
|
||||
FR_P06 = f46
|
||||
|
||||
|
||||
GR_ADDR0 = r2
|
||||
GR_ADDR = r2
|
||||
GR_D_ADDR0 = r3
|
||||
GR_D_ADDR = r3
|
||||
GR_LEADBITS = r8
|
||||
GR_256 = r9
|
||||
GR_EM63 = r10
|
||||
GR_255 = r11
|
||||
GR_EXPON = r14
|
||||
GR_BM63 = r15
|
||||
GR_UF_TEST = r16
|
||||
GR_INDEX = r17
|
||||
GR_K = r18
|
||||
GR_KF = r19
|
||||
GR_2P14 = r19
|
||||
GR_EMIN = r20
|
||||
GR_IT = r21
|
||||
GR_ID = r22
|
||||
GR_63 = r23
|
||||
GR_CONST1 = r24
|
||||
GR_EBIAS = r25
|
||||
GR_CONST2 = r26
|
||||
GR_CONST3 = r27
|
||||
GR_SIGNIF = r28
|
||||
GR_ARGEXP = r29
|
||||
GR_SGN = r30
|
||||
GR_EMIN1 = r31
|
||||
GR_SREG = r32
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT= r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0x3fac6b08d704a0c0 // C_3
|
||||
data8 0x3f83b2ab6fba4e77 // C_4
|
||||
data8 0x3f55d87fe78a6731 // C_5
|
||||
data8 0x3f2430912f86c787 // C_6
|
||||
data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
|
||||
data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
data8 0x8000000000000000, 0x8058d7d2d5e5f6b1
|
||||
data8 0x80b1ed4fd999ab6c, 0x810b40a1d81406d4
|
||||
data8 0x8164d1f3bc030773, 0x81bea1708dde6056
|
||||
data8 0x8218af4373fc25ec, 0x8272fb97b2a5894c
|
||||
data8 0x82cd8698ac2ba1d7, 0x83285071e0fc4547
|
||||
data8 0x8383594eefb6ee37, 0x83dea15b9541b132
|
||||
data8 0x843a28c3acde4046, 0x8495efb3303efd30
|
||||
data8 0x84f1f656379c1a29, 0x854e3cd8f9c8c95d
|
||||
data8 0x85aac367cc487b15, 0x86078a2f23642a9f
|
||||
data8 0x8664915b923fba04, 0x86c1d919caef5c88
|
||||
data8 0x871f61969e8d1010, 0x877d2afefd4e256c
|
||||
data8 0x87db357ff698d792, 0x88398146b919f1d4
|
||||
data8 0x88980e8092da8527, 0x88f6dd5af155ac6b
|
||||
data8 0x8955ee03618e5fdd, 0x89b540a7902557a4
|
||||
data8 0x8a14d575496efd9a, 0x8a74ac9a79896e47
|
||||
data8 0x8ad4c6452c728924, 0x8b3522a38e1e1032
|
||||
data8 0x8b95c1e3ea8bd6e7, 0x8bf6a434adde0085
|
||||
data8 0x8c57c9c4646f4dde, 0x8cb932c1bae97a95
|
||||
data8 0x8d1adf5b7e5ba9e6, 0x8d7ccfc09c50e2f8
|
||||
data8 0x8ddf042022e69cd6, 0x8e417ca940e35a01
|
||||
data8 0x8ea4398b45cd53c0, 0x8f073af5a2013520
|
||||
data8 0x8f6a8117e6c8e5c4, 0x8fce0c21c6726481
|
||||
data8 0x9031dc431466b1dc, 0x9095f1abc540ca6b
|
||||
data8 0x90fa4c8beee4b12b, 0x915eed13c89689d3
|
||||
data8 0x91c3d373ab11c336, 0x9228ffdc10a051ad
|
||||
data8 0x928e727d9531f9ac, 0x92f42b88f673aa7c
|
||||
data8 0x935a2b2f13e6e92c, 0x93c071a0eef94bc1
|
||||
data8 0x9426ff0fab1c04b6, 0x948dd3ac8ddb7ed3
|
||||
data8 0x94f4efa8fef70961, 0x955c5336887894d5
|
||||
data8 0x95c3fe86d6cc7fef, 0x962bf1cbb8d97560
|
||||
data8 0x96942d3720185a00, 0x96fcb0fb20ac4ba3
|
||||
data8 0x97657d49f17ab08e, 0x97ce9255ec4357ab
|
||||
data8 0x9837f0518db8a96f, 0x98a1976f7597e996
|
||||
data8 0x990b87e266c189aa, 0x9975c1dd47518c77
|
||||
data8 0x99e0459320b7fa65, 0x9a4b13371fd166ca
|
||||
data8 0x9ab62afc94ff864a, 0x9b218d16f441d63d
|
||||
data8 0x9b8d39b9d54e5539, 0x9bf93118f3aa4cc1
|
||||
data8 0x9c6573682ec32c2d, 0x9cd200db8a0774cb
|
||||
data8 0x9d3ed9a72cffb751, 0x9dabfdff6367a2aa
|
||||
data8 0x9e196e189d472420, 0x9e872a276f0b98ff
|
||||
data8 0x9ef5326091a111ae, 0x9f6386f8e28ba651
|
||||
data8 0x9fd228256400dd06, 0xa041161b3d0121be
|
||||
data8 0xa0b0510fb9714fc2, 0xa11fd9384a344cf7
|
||||
data8 0xa18faeca8544b6e4, 0xa1ffd1fc25cea188
|
||||
data8 0xa27043030c496819, 0xa2e102153e918f9e
|
||||
data8 0xa3520f68e802bb93, 0xa3c36b345991b47c
|
||||
data8 0xa43515ae09e6809e, 0xa4a70f0c95768ec5
|
||||
data8 0xa5195786be9ef339, 0xa58bef536dbeb6ee
|
||||
data8 0xa5fed6a9b15138ea, 0xa6720dc0be08a20c
|
||||
data8 0xa6e594cfeee86b1e, 0xa7596c0ec55ff55b
|
||||
data8 0xa7cd93b4e965356a, 0xa8420bfa298f70d1
|
||||
data8 0xa8b6d5167b320e09, 0xa92bef41fa77771b
|
||||
data8 0xa9a15ab4ea7c0ef8, 0xaa1717a7b5693979
|
||||
data8 0xaa8d2652ec907629, 0xab0386ef48868de1
|
||||
data8 0xab7a39b5a93ed337, 0xabf13edf162675e9
|
||||
data8 0xac6896a4be3fe929, 0xace0413ff83e5d04
|
||||
data8 0xad583eea42a14ac6, 0xadd08fdd43d01491
|
||||
data8 0xae493452ca35b80e, 0xaec22c84cc5c9465
|
||||
data8 0xaf3b78ad690a4375, 0xafb51906e75b8661
|
||||
data8 0xb02f0dcbb6e04584, 0xb0a957366fb7a3c9
|
||||
data8 0xb123f581d2ac2590, 0xb19ee8e8c94feb09
|
||||
data8 0xb21a31a66618fe3b, 0xb295cff5e47db4a4
|
||||
data8 0xb311c412a9112489, 0xb38e0e38419fae18
|
||||
data8 0xb40aaea2654b9841, 0xb487a58cf4a9c180
|
||||
data8 0xb504f333f9de6484, 0xb58297d3a8b9f0d2
|
||||
data8 0xb60093a85ed5f76c, 0xb67ee6eea3b22b8f
|
||||
data8 0xb6fd91e328d17791, 0xb77c94c2c9d725e9
|
||||
data8 0xb7fbefca8ca41e7c, 0xb87ba337a1743834
|
||||
data8 0xb8fbaf4762fb9ee9, 0xb97c143756844dbf
|
||||
data8 0xb9fcd2452c0b9deb, 0xba7de9aebe5fea09
|
||||
data8 0xbaff5ab2133e45fb, 0xbb81258d5b704b6f
|
||||
data8 0xbc034a7ef2e9fb0d, 0xbc85c9c560e7b269
|
||||
data8 0xbd08a39f580c36bf, 0xbd8bd84bb67ed483
|
||||
data8 0xbe0f6809860993e2, 0xbe935317fc378238
|
||||
data8 0xbf1799b67a731083, 0xbf9c3c248e2486f8
|
||||
data8 0xc0213aa1f0d08db0, 0xc0a6956e8836ca8d
|
||||
data8 0xc12c4cca66709456, 0xc1b260f5ca0fbb33
|
||||
data8 0xc238d2311e3d6673, 0xc2bfa0bcfad907c9
|
||||
data8 0xc346ccda24976407, 0xc3ce56c98d21b15d
|
||||
data8 0xc4563ecc5334cb33, 0xc4de8523c2c07baa
|
||||
data8 0xc5672a115506dadd, 0xc5f02dd6b0bbc3d9
|
||||
data8 0xc67990b5aa245f79, 0xc70352f04336c51e
|
||||
data8 0xc78d74c8abb9b15d, 0xc817f681416452b2
|
||||
data8 0xc8a2d85c8ffe2c45, 0xc92e1a9d517f0ecc
|
||||
data8 0xc9b9bd866e2f27a3, 0xca45c15afcc72624
|
||||
data8 0xcad2265e4290774e, 0xcb5eecd3b38597c9
|
||||
data8 0xcbec14fef2727c5d, 0xcc799f23d11510e5
|
||||
data8 0xcd078b86503dcdd2, 0xcd95da6a9ff06445
|
||||
data8 0xce248c151f8480e4, 0xceb3a0ca5dc6a55d
|
||||
data8 0xcf4318cf191918c1, 0xcfd2f4683f94eeb5
|
||||
data8 0xd06333daef2b2595, 0xd0f3d76c75c5db8d
|
||||
data8 0xd184df6251699ac6, 0xd2164c023056bcab
|
||||
data8 0xd2a81d91f12ae45a, 0xd33a5457a3029054
|
||||
data8 0xd3ccf099859ac379, 0xd45ff29e0972c561
|
||||
data8 0xd4f35aabcfedfa1f, 0xd5872909ab75d18a
|
||||
data8 0xd61b5dfe9f9bce07, 0xd6aff9d1e13ba2fe
|
||||
data8 0xd744fccad69d6af4, 0xd7da67311797f56a
|
||||
data8 0xd870394c6db32c84, 0xd9067364d44a929c
|
||||
data8 0xd99d15c278afd7b6, 0xda3420adba4d8704
|
||||
data8 0xdacb946f2ac9cc72, 0xdb63714f8e295255
|
||||
data8 0xdbfbb797daf23755, 0xdc9467913a4f1c92
|
||||
data8 0xdd2d818508324c20, 0xddc705bcd378f7f0
|
||||
data8 0xde60f4825e0e9124, 0xdefb4e1f9d1037f2
|
||||
data8 0xdf9612deb8f04420, 0xe031430a0d99e627
|
||||
data8 0xe0ccdeec2a94e111, 0xe168e6cfd3295d23
|
||||
data8 0xe2055afffe83d369, 0xe2a23bc7d7d91226
|
||||
data8 0xe33f8972be8a5a51, 0xe3dd444c46499619
|
||||
data8 0xe47b6ca0373da88d, 0xe51a02ba8e26d681
|
||||
data8 0xe5b906e77c8348a8, 0xe658797368b3a717
|
||||
data8 0xe6f85aaaee1fce22, 0xe798aadadd5b9cbf
|
||||
data8 0xe8396a503c4bdc68, 0xe8da9958464b42ab
|
||||
data8 0xe97c38406c4f8c57, 0xea1e4756550eb27b
|
||||
data8 0xeac0c6e7dd24392f, 0xeb63b74317369840
|
||||
data8 0xec0718b64c1cbddc, 0xecaaeb8ffb03ab41
|
||||
data8 0xed4f301ed9942b84, 0xedf3e6b1d418a491
|
||||
data8 0xee990f980da3025b, 0xef3eab20e032bc6b
|
||||
data8 0xefe4b99bdcdaf5cb, 0xf08b3b58cbe8b76a
|
||||
data8 0xf13230a7ad094509, 0xf1d999d8b7708cc1
|
||||
data8 0xf281773c59ffb13a, 0xf329c9233b6bae9c
|
||||
data8 0xf3d28fde3a641a5b, 0xf47bcbbe6db9fddf
|
||||
data8 0xf5257d152486cc2c, 0xf5cfa433e6537290
|
||||
data8 0xf67a416c733f846e, 0xf7255510c4288239
|
||||
data8 0xf7d0df730ad13bb9, 0xf87ce0e5b2094d9c
|
||||
data8 0xf92959bb5dd4ba74, 0xf9d64a46eb939f35
|
||||
data8 0xfa83b2db722a033a, 0xfb3193cc4227c3f4
|
||||
data8 0xfbdfed6ce5f09c49, 0xfc8ec01121e447bb
|
||||
data8 0xfd3e0c0cf486c175, 0xfdedd1b496a89f35
|
||||
data8 0xfe9e115c7b8f884c, 0xff4ecb59511ec8a5
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(D_table)
|
||||
|
||||
data4 0x00000000, 0x9f55c08f, 0x1e93ffa3, 0x1dcd43a8
|
||||
data4 0x1f751f79, 0x9f3cdd88, 0x9f43d155, 0x1eda222c
|
||||
data4 0x1ef35513, 0x9f597895, 0x9e698881, 0x1ec71073
|
||||
data4 0x1e50e371, 0x9dc01e19, 0x1de74133, 0x1e2f028c
|
||||
data4 0x9edefb47, 0x1ebbac48, 0x9e8b0330, 0x9e9e9314
|
||||
data4 0x1edc1d11, 0x1f098529, 0x9f52827c, 0x1f50050d
|
||||
data4 0x1f301e8e, 0x1f5b64d1, 0x9f45e3ee, 0x9ef64d6d
|
||||
data4 0x1d6ec5e8, 0x9e61ad9a, 0x1d44ccbb, 0x9e4a8bbb
|
||||
data4 0x9cf11576, 0x9dcce7e7, 0x9d02ac90, 0x1f26ccf0
|
||||
data4 0x9f0877c6, 0x9ddd62ae, 0x9f4b7fc3, 0x1ea8ef6b
|
||||
data4 0x1ea4378d, 0x1ef6fc38, 0x1db99fd9, 0x1f22bf6f
|
||||
data4 0x1f53e172, 0x1e85504a, 0x9f37cc75, 0x1f0c5e17
|
||||
data4 0x1dde8aac, 0x9cb42bb2, 0x1e153cd7, 0x1eb62bba
|
||||
data4 0x9e9b941b, 0x9ea80e3c, 0x1f508823, 0x1ec3fd36
|
||||
data4 0x1e9ffaa1, 0x1e21e2eb, 0x9d948b1d, 0x9e8ac93a
|
||||
data4 0x1ef7ee6f, 0x9e80dda3, 0x1f0814be, 0x1dc5ddfe
|
||||
data4 0x1eedb9d1, 0x9f2aaa26, 0x9ea5b0fc, 0x1edf702e
|
||||
data4 0x9e391201, 0x1f1316bb, 0x1ea27fb7, 0x9e05ed18
|
||||
data4 0x9f199ed2, 0x1ee7fd7c, 0x1f003db6, 0x9eac3793
|
||||
data4 0x9e5b8c10, 0x9f3af17c, 0x1bc9a8be, 0x1ee3c004
|
||||
data4 0x9f19b1b2, 0x9f242ce9, 0x9ce67dd1, 0x9e4f6275
|
||||
data4 0x1e20742c, 0x1eb9328a, 0x9f477153, 0x1d969718
|
||||
data4 0x9f1e6c43, 0x1f2f67f4, 0x9f39c7e4, 0x9e3c4feb
|
||||
data4 0x1da3956b, 0x9e7c685d, 0x1f280911, 0x9f0d8afb
|
||||
data4 0x1e314b40, 0x9eb4f250, 0x9f1a34ad, 0x1ef5d5e7
|
||||
data4 0x9f145496, 0x1e604827, 0x9f1e5195, 0x1e9c1fc0
|
||||
data4 0x1efde521, 0x1e69b385, 0x1f316830, 0x9f244eae
|
||||
data4 0x1f1787ec, 0x9e939971, 0x1f0bb393, 0x9f0511d6
|
||||
data4 0x1ed919de, 0x1d8b7b28, 0x1e5ca4a9, 0x1e7c357b
|
||||
data4 0x9e3ff8e8, 0x1eef53b5, 0x9ed22ed7, 0x1f16659b
|
||||
data4 0x9f2db102, 0x9e2c6a78, 0x1f328d7d, 0x9f2fec3c
|
||||
data4 0x1eb395bd, 0x9f242b84, 0x9e2683e6, 0x1ed71e68
|
||||
data4 0x1efd1df5, 0x9e9eeafd, 0x9ed2249c, 0x1eef129a
|
||||
data4 0x1d1ea44c, 0x9e81f7ff, 0x1eaf77c9, 0x9ee7a285
|
||||
data4 0x1e1864ed, 0x9ee7edbb, 0x9e15a27d, 0x9ae61655
|
||||
data4 0x1f1ff1a2, 0x1da29755, 0x9e5f46fb, 0x1e901236
|
||||
data4 0x9eecfb9b, 0x9f204d2f, 0x1ec64685, 0x9eb809bd
|
||||
data4 0x9e0026c5, 0x1d9f1da1, 0x1f142b49, 0x9f20f22e
|
||||
data4 0x1f24b067, 0x1f185a4c, 0x9f09765c, 0x9ece902f
|
||||
data4 0x1e2ca5db, 0x1e6de464, 0x9f071f67, 0x1f1518c3
|
||||
data4 0x1ea13ded, 0x1f0b8414, 0x1edb6ad4, 0x9e548740
|
||||
data4 0x9ea10efb, 0x1ee48a60, 0x1e7954c5, 0x9edad013
|
||||
data4 0x9f21517d, 0x9e9b6e0c, 0x9ee7f9a6, 0x9ebd4298
|
||||
data4 0x9d65b24e, 0x1eed751f, 0x9f1573ea, 0x9d430377
|
||||
data4 0x9e13fc0c, 0x1e47008a, 0x1e3d5c1d, 0x1ef41a91
|
||||
data4 0x9e4a4ef7, 0x9e952f18, 0x1d620566, 0x1d9b8d33
|
||||
data4 0x1db06247, 0x1e94b31e, 0x1f0730ad, 0x9d79ffb4
|
||||
data4 0x1ed64d51, 0x9e91fd11, 0x9e28d35a, 0x9dea0ed9
|
||||
data4 0x1e891def, 0x9ee28ac0, 0x1e1db99b, 0x9ee1ce38
|
||||
data4 0x9bdd9bca, 0x1eb72cb9, 0x9e8c53c6, 0x1e0df6ca
|
||||
data4 0x1e8f2ccd, 0x9e9b0886, 0x1eeb3bc7, 0x1ec7e772
|
||||
data4 0x9e210776, 0x9daf246c, 0x1ea1f151, 0x1ece4dc6
|
||||
data4 0x1ce741c8, 0x1ed3c88f, 0x9ec9a4fd, 0x9e0c8d30
|
||||
data4 0x1d2fbb26, 0x9ef212a7, 0x1ee44f1c, 0x9e445550
|
||||
data4 0x1e075f77, 0x9d9291a3, 0x1f09c2ee, 0x9e012c88
|
||||
data4 0x1f057d62, 0x9e7bb0dc, 0x9d8758ee, 0x1ee8d6c1
|
||||
data4 0x9e509a57, 0x9e4ca7b7, 0x1e2cb341, 0x9ec35106
|
||||
data4 0x1ecf3baf, 0x1e11781c, 0x1ea0cc78, 0x1eb75ca6
|
||||
data4 0x1e961e1a, 0x1eb88853, 0x1e7abf50, 0x1ee38704
|
||||
data4 0x9dc5ab0f, 0x1afe197b, 0x9ec07523, 0x9d9b7f78
|
||||
data4 0x1f011618, 0x1ed43b0b, 0x9f035945, 0x9e3fd014
|
||||
data4 0x9bbda5cd, 0x9e83f8ab, 0x1e58a928, 0x1e392d61
|
||||
data4 0x1efdbb52, 0x1ee310a8, 0x9ec7ecc1, 0x1e8c9ed6
|
||||
data4 0x9ef82dee, 0x9e70545b, 0x9ea53fc4, 0x1e40f419
|
||||
LOCAL_OBJECT_END(D_table)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_LIBM_ENTRY(exp2l)
|
||||
|
||||
{.mii
|
||||
// get exponent
|
||||
getf.exp GR_EBIAS = f8
|
||||
// GR_D_ADDR0 = pointer to D_table
|
||||
addl GR_D_ADDR0 = @ltoff(D_table), gp
|
||||
// GR_ADDR0 = pointer to C_1...C_6 followed by T_table
|
||||
addl GR_ADDR0 = @ltoff(poly_coeffs), gp ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// get significand
|
||||
getf.sig GR_SIGNIF = f8
|
||||
// will continue only for normal/denormal numbers
|
||||
fclass.nm.unc p12, p7 = f8, 0x1b
|
||||
mov GR_63 = 63 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
// GR_CONST2 = bias+63-8
|
||||
mov GR_CONST2 = 0xffff+55
|
||||
}
|
||||
{.mfi
|
||||
// GR_CONST1 = bias+15
|
||||
mov GR_CONST1 = 0xffff+15
|
||||
nop.f 0
|
||||
mov GR_CONST3 = 0x1ffff ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load start address for C_1...C_6 followed by T_table
|
||||
ld8 GR_ADDR = [ GR_ADDR0 ]
|
||||
nop.f 0
|
||||
// get sign of argument
|
||||
andcm GR_SGN = GR_EBIAS, GR_CONST3
|
||||
}
|
||||
{.mfi
|
||||
// GR_D_ADDR = pointer to D_table
|
||||
ld8 GR_D_ADDR = [ GR_D_ADDR0 ]
|
||||
nop.f 0
|
||||
// get argument exponent
|
||||
and GR_ARGEXP = GR_CONST3, GR_EBIAS ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
alloc GR_SREG = ar.pfs, 1, 4, 4, 0
|
||||
nop.f 0
|
||||
// p6 = 1 if sign = 1
|
||||
cmp.ne p6, p8 = GR_SGN, r0
|
||||
}
|
||||
{.mfi
|
||||
// p7 = 1 if exponent> = 15 (argument out of range)
|
||||
cmp.ge p7, p0 = GR_ARGEXP, GR_CONST1
|
||||
nop.f 0
|
||||
sub GR_EXPON = GR_CONST2, GR_ARGEXP ;;
|
||||
}
|
||||
|
||||
{.mib
|
||||
// load C_3, C_4
|
||||
ldfpd FR_COEFF3, FR_COEFF4 = [ GR_ADDR ], 16
|
||||
// get first exponent+8 bits
|
||||
shr.u GR_LEADBITS = GR_SIGNIF, GR_EXPON
|
||||
(p12) br.cond.spnt SPECIAL_exp2l
|
||||
}
|
||||
{.mib
|
||||
mov GR_256 = 256
|
||||
// exponent- = 63
|
||||
sub GR_EM63 = GR_EBIAS, GR_63
|
||||
(p7) br.cond.spnt OUT_RANGE_exp2l ;;
|
||||
}
|
||||
|
||||
{.mlx
|
||||
// load C_5, C_6
|
||||
ldfpd FR_COEFF5, FR_COEFF6 = [ GR_ADDR ], 16
|
||||
// GR_2P14 = 2^14
|
||||
movl GR_2P14 = 0x46800000 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load C_1
|
||||
ldfe FR_COEFF1 = [ GR_ADDR ], 16
|
||||
fma.s0 f8 = f8, f1, f0
|
||||
// GR_BM63 = bias-63
|
||||
mov GR_BM63 = 0xffff-63 ;;
|
||||
}
|
||||
|
||||
{.mlx
|
||||
setf.s FR_2P14 = GR_2P14
|
||||
// GR_UF_TEST = -2^14-62
|
||||
movl GR_UF_TEST = 0xc6807c00
|
||||
}
|
||||
{.mfi
|
||||
// load C_2
|
||||
ldfe FR_COEFF2 = [ GR_ADDR ], 16
|
||||
nop.f 0
|
||||
mov GR_255 = 255 ;;
|
||||
}
|
||||
|
||||
{.mib
|
||||
// get 8-bit index
|
||||
and GR_INDEX = GR_255, GR_LEADBITS
|
||||
// get K = integer part
|
||||
shr.u GR_K = GR_LEADBITS, 8
|
||||
nop.b 0 ;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// if sign = 1 && f>0, set p7 = 1
|
||||
(p6) cmp.gt.unc p7, p0 = GR_INDEX, r0
|
||||
setf.s FR_UF_TEST = GR_UF_TEST
|
||||
shl GR_KF = GR_LEADBITS, GR_EXPON ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// if sign = 1 && f>0, set f = 1-f
|
||||
(p7) sub GR_INDEX = GR_256, GR_INDEX
|
||||
nop.f 0
|
||||
// if sign = 1 && f>0, set K = K+1
|
||||
(p7) add GR_K = GR_K, r0, 1 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// FR_EXP63 = 2^{expon-63}
|
||||
setf.exp FR_EXP63 = GR_EM63
|
||||
nop.f 0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex", p6, p8
|
||||
{.mfi
|
||||
// if sign = 0, set scale factor exponent S = K+bias-63
|
||||
(p8) add GR_K = GR_K, GR_BM63
|
||||
nop.f 0
|
||||
// if sign = 1, set scale factor exponent S = -K+bias-63
|
||||
(p6) sub GR_K = GR_BM63, GR_K ;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// FR_KF0 = 2^{63-expon}*(K+f)
|
||||
setf.sig FR_KF0 = GR_KF
|
||||
nop.m 0
|
||||
// GR_EMIN = EMIN = 2-2^14
|
||||
mov GR_EMIN = 0x18cfff ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// get T_table index
|
||||
shladd GR_IT = GR_INDEX, 3, GR_ADDR
|
||||
// p7 = 1 if x >= 2^14 (FR_2P14)
|
||||
fcmp.ge.s1 p7, p12 = f8, FR_2P14
|
||||
// get D_table index
|
||||
shladd GR_ID = GR_INDEX, 2, GR_D_ADDR ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load T_table value
|
||||
ldf8 FR_T = [ GR_IT ]
|
||||
// p7 = 1 if x < -2^14-62 (FR_UF_TEST)
|
||||
(p12) fcmp.lt.s1 p7, p0 = f8, FR_UF_TEST
|
||||
// GR_EMIN1 = EMIN = 2-2^14
|
||||
shl GR_EMIN1 = GR_EMIN, 11 ;;
|
||||
}
|
||||
|
||||
{.mmb
|
||||
// FR_2EXP = scale factor = 2^{K-63}
|
||||
setf.exp FR_2EXP = GR_K
|
||||
// load D_table value
|
||||
ldfs FR_D = [ GR_ID ]
|
||||
(p7) br.cond.spnt OUT_RANGE_exp2l ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// get r = x-(K+f)
|
||||
fnma.s1 FR_R = FR_KF0, FR_EXP63, f8
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// FR_EMIN = EMIN
|
||||
setf.s FR_EMIN = GR_EMIN1
|
||||
// P34 = C_4*r+C_3
|
||||
fma.s1 FR_P34 = FR_COEFF4, FR_R, FR_COEFF3
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P56 = C_6*r+C_5
|
||||
fma.s1 FR_P56 = FR_COEFF6, FR_R, FR_COEFF5
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r*r
|
||||
fma.s1 FR_R2 = FR_R, FR_R, f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P12 = C_2*r+C_1
|
||||
fma.s1 FR_P12 = FR_COEFF2, FR_R, FR_COEFF1
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// T* = scaling factor
|
||||
fma.s1 FR_TS = FR_T, FR_2EXP, f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P36 = P34+r2*P56
|
||||
fma.s1 FR_P36 = FR_P56, FR_R2, FR_P34
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P02 = D+r*P12
|
||||
fma.s1 FR_P02 = FR_P12, FR_R, FR_D
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// FR_R3 = r*r2 (= r^3)
|
||||
fma.s1 FR_R3 = FR_R2, FR_R, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P06 = P02+r3*P36
|
||||
fma.s1 FR_P06 = FR_P36, FR_R3, FR_P02
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// underflow (x<EMIN) ?
|
||||
fcmp.lt.s0 p6, p8 = f8, FR_EMIN
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result = T+T*P06
|
||||
fma.s0 f8 = FR_TS, FR_P06, FR_TS
|
||||
// return
|
||||
(p8) br.ret.sptk b0
|
||||
}
|
||||
{.mfb
|
||||
(p6) mov GR_Parameter_TAG = 160
|
||||
nop.f 0
|
||||
(p6) br.cond.sptk __libm_error_region ;;
|
||||
}
|
||||
|
||||
|
||||
SPECIAL_exp2l:
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x = -Infinity ?
|
||||
fclass.m p6, p0 = f8, 0x22
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x = +Infinity ?
|
||||
fclass.m p7, p0 = f8, 0x21
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x = +/-Zero ?
|
||||
fclass.m p8, p0 = f8, 0x7
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// exp2l(-Infinity) = 0
|
||||
(p6) mov f8 = f0
|
||||
(p6) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// exp2l(+Infinity) = +Infinity
|
||||
nop.f 0
|
||||
(p7) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// exp2l(+/-0) = 1
|
||||
(p8) mov f8 = f1
|
||||
(p8) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// Remaining cases: NaNs
|
||||
fma.s0 f8 = f8, f1, f0
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
|
||||
OUT_RANGE_exp2l:
|
||||
|
||||
|
||||
{.mfi
|
||||
// overflow: p8 = 1
|
||||
(p8) mov GR_EM63 = 0x1fffe
|
||||
// normalize input, to detect pseudo-zeroes
|
||||
fma.s0 f8 = f8, f1, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f8 = 0?
|
||||
fcmp.eq.s1 p7, p0 = f8, f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mmb
|
||||
(p8) mov GR_Parameter_TAG = 159
|
||||
(p8) setf.exp FR_TS = GR_EM63
|
||||
nop.b 999 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// pseudo-zero
|
||||
(p7) mov f8 = f1
|
||||
(p7) br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p8) fma.s0 f8 = FR_TS, FR_TS, f0
|
||||
nop.i 999
|
||||
}
|
||||
{.mii
|
||||
nop.m 0
|
||||
// underflow: p6 = 1
|
||||
(p6) mov GR_EM63 = 1
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mmb
|
||||
(p6) mov GR_Parameter_TAG = 160
|
||||
(p6) setf.exp FR_TS = GR_EM63
|
||||
nop.b 999 ;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 999
|
||||
(p6) fma.s0 f8 = FR_TS, FR_TS, f0
|
||||
nop.b 0 ;;
|
||||
}
|
||||
|
||||
|
||||
GLOBAL_LIBM_END(exp2l)
|
||||
libm_alias_ldouble_other (exp2, exp2)
|
||||
|
||||
|
||||
// __libm_error_region
// Error exit for exp2l. It is entered by the (p6) branch on the main path
// and, apparently, by fall-through from OUT_RANGE_exp2l; in both cases
// GR_Parameter_TAG has already been set (159 or 160 in the code above).
//
// The routine saves ar.pfs, gp and b0, opens a 64-byte frame, spills three
// values with stfe and passes their addresses to __libm_error_support:
//   new sp+16 : FR_X       (GR_Parameter_X,      parameter 1)
//   new sp+32 : FR_Y       (GR_Parameter_Y,      parameter 2)
//   new sp+48 : FR_RESULT  (GR_Parameter_RESULT, parameter 3)
// After the call the value at sp+48 is reloaded into f8, the saved state is
// restored and control returns through b0.
//
// NOTE(review): FR_X is f10, which exp2l also uses as FR_COEFF3, and no copy
// of the original argument into f10 is visible in exp2l -- confirm which
// value the error handler actually receives as parameter 1.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{.mfi
|
||||
add GR_Parameter_Y = -32, sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs, GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
|
||||
}
|
||||
{.mfi
|
||||
.fframe 64
|
||||
add sp = -64, sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP = gp ;; // Save gp
|
||||
}
|
||||
|
||||
{.mmi
|
||||
stfe [ GR_Parameter_Y ] = FR_Y, 16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16, sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0 = b0 ;; // Save b0
|
||||
}
|
||||
|
||||
.body
|
||||
{.mib
|
||||
stfe [ GR_Parameter_X ] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{.mib
|
||||
stfe [ GR_Parameter_Y ] = FR_RESULT // STORE Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16, GR_Parameter_Y // Back to the Parameter 2 slot (new sp+32)
|
||||
br.call.sptk b0 = __libm_error_support# ;; // Call error handling function
|
||||
}
|
||||
|
||||
{.mmi
|
||||
add GR_Parameter_RESULT = 48, sp // Recompute the Parameter 3 (result) address after the call
|
||||
nop.m 0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64, sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 ;; // Restore return address
|
||||
}
|
||||
|
||||
{.mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 ;; // Return
|
||||
}
|
||||
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
.type __libm_error_support#, @function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,722 +0,0 @@
|
||||
.file "expf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2005, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
|
||||
// History
|
||||
//*********************************************************************
|
||||
// 02/02/00 Original version
|
||||
// 04/04/00 Unwind support added
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 08/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case
|
||||
// 12/07/00 Widen main path, shorten x=inf, nan paths
|
||||
// 03/15/01 Fix monotonicity problem around x=0 for round to +inf
|
||||
// 02/05/02 Corrected uninitialized predicate in POSSIBLE_UNDERFLOW path
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 07/26/02 Algorithm changed, accuracy improved
|
||||
// 09/26/02 support of higher precision inputs added, underflow threshold
|
||||
// corrected
|
||||
// 11/15/02 Improved performance on Itanium 2, added possible over/under paths
|
||||
// 05/30/03 Set inexact flag on unmasked overflow/underflow
|
||||
// 03/31/05 Reformatted delimiters between data tables
|
||||
//
|
||||
//
|
||||
// API
|
||||
//*********************************************************************
|
||||
// float expf(float)
|
||||
//
|
||||
// Overview of operation
|
||||
//*********************************************************************
|
||||
// Take the input x. w is "how many log2/64 in x?"
|
||||
// w = x * 64/log2
|
||||
// NJ = int(w)
|
||||
// x = NJ*log2/64 + R
|
||||
|
||||
// NJ = 64*n + j
|
||||
// x = n*log2 + (log2/64)*j + R
|
||||
//
|
||||
// So, exp(x) = 2^n * 2^(j/64)* exp(R)
|
||||
//
|
||||
// T = 2^n * 2^(j/64)
|
||||
// Construct 2^n
|
||||
// Get 2^(j/64) table
|
||||
// actually all the entries of 2^(j/64) table are stored in DP and
|
||||
// with exponent bits set to 0 -> multiplication on 2^n can be
|
||||
// performed by doing logical "or" operation with bits representing 2^n
|
||||
|
||||
// exp(R) = 1 + (exp(R) - 1)
|
||||
// P = exp(R) - 1 approximated by Taylor series of 3rd degree
|
||||
// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
|
||||
//
|
||||
|
||||
// The final result is reconstructed as follows
|
||||
// exp(x) = T + T*P
|
||||
|
||||
// Special values
|
||||
//*********************************************************************
|
||||
// expf(+0) = 1.0
|
||||
// expf(-0) = 1.0
|
||||
|
||||
// expf(+qnan) = +qnan
|
||||
// expf(-qnan) = -qnan
|
||||
// expf(+snan) = +qnan
|
||||
// expf(-snan) = -qnan
|
||||
|
||||
// expf(-inf) = +0
|
||||
// expf(+inf) = +inf
|
||||
|
||||
// Overflow and Underflow
|
||||
//*********************************************************************
|
||||
// expf(x) = largest single normal when
|
||||
// x = 88.72283 = 0x42b17217
|
||||
|
||||
// expf(x) = smallest single normal when
|
||||
// x = -87.33654 = 0xc2aeac4f
|
||||
|
||||
// expf(x) = largest round-to-nearest single zero when
|
||||
// x = -103.97208 = 0xc2cff1b5
|
||||
|
||||
|
||||
// Registers used
|
||||
//*********************************************************************
|
||||
// Floating Point registers used:
|
||||
// f8, input
|
||||
// f6,f7, f9 -> f15, f32 -> f40
|
||||
|
||||
// General registers used:
|
||||
// r3, r23 -> r38
|
||||
|
||||
// Predicate registers used:
|
||||
// p10 -> p15
|
||||
|
||||
// Assembly macros
|
||||
//*********************************************************************
|
||||
// integer registers used
|
||||
// scratch
|
||||
rNJ = r3
|
||||
|
||||
rTmp = r23
|
||||
rJ = r23
|
||||
rN = r24
|
||||
rTblAddr = r25
|
||||
rA3 = r26
|
||||
rExpHalf = r27
|
||||
rLn2Div64 = r28
|
||||
r17ones_m1 = r29
|
||||
rGt_ln = r29
|
||||
rRightShifter = r30
|
||||
r64DivLn2 = r31
|
||||
// stacked
|
||||
GR_SAVE_PFS = r32
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_GP = r34
|
||||
GR_Parameter_X = r35
|
||||
GR_Parameter_Y = r36
|
||||
GR_Parameter_RESULT = r37
|
||||
GR_Parameter_TAG = r38
|
||||
|
||||
// floating point registers used
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
// scratch
|
||||
fRightShifter = f6
|
||||
f64DivLn2 = f7
|
||||
fNormX = f9
|
||||
fNint = f10
|
||||
fN = f11
|
||||
fR = f12
|
||||
fLn2Div64 = f13
|
||||
fA2 = f14
|
||||
fA3 = f15
|
||||
// stacked
|
||||
fP = f32
|
||||
fT = f33
|
||||
fMIN_SGL_OFLOW_ARG = f34
|
||||
fMAX_SGL_ZERO_ARG = f35
|
||||
fMAX_SGL_NORM_ARG = f36
|
||||
fMIN_SGL_NORM_ARG = f37
|
||||
fRSqr = f38
|
||||
fTmp = f39
|
||||
fGt_pln = f39
|
||||
fWre_urm_f8 = f40
|
||||
fFtz_urm_f8 = f40
|
||||
|
||||
|
||||
RODATA
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(_expf_table)
|
||||
data4 0x42b17218 // Smallest sgl arg to overflow sgl result, +88.7228
|
||||
data4 0xc2cff1b5 // Largest sgl for rnd-to-nearest 0 result, -103.9720
|
||||
data4 0x42b17217 // Largest sgl arg to give normal sgl result, +88.7228
|
||||
data4 0xc2aeac4f // Smallest sgl arg to give normal sgl result, -87.3365
|
||||
//
|
||||
// 2^(j/64) table, j goes from 0 to 63
|
||||
data8 0x0000000000000000 // 2^(0/64)
|
||||
data8 0x00002C9A3E778061 // 2^(1/64)
|
||||
data8 0x000059B0D3158574 // 2^(2/64)
|
||||
data8 0x0000874518759BC8 // 2^(3/64)
|
||||
data8 0x0000B5586CF9890F // 2^(4/64)
|
||||
data8 0x0000E3EC32D3D1A2 // 2^(5/64)
|
||||
data8 0x00011301D0125B51 // 2^(6/64)
|
||||
data8 0x0001429AAEA92DE0 // 2^(7/64)
|
||||
data8 0x000172B83C7D517B // 2^(8/64)
|
||||
data8 0x0001A35BEB6FCB75 // 2^(9/64)
|
||||
data8 0x0001D4873168B9AA // 2^(10/64)
|
||||
data8 0x0002063B88628CD6 // 2^(11/64)
|
||||
data8 0x0002387A6E756238 // 2^(12/64)
|
||||
data8 0x00026B4565E27CDD // 2^(13/64)
|
||||
data8 0x00029E9DF51FDEE1 // 2^(14/64)
|
||||
data8 0x0002D285A6E4030B // 2^(15/64)
|
||||
data8 0x000306FE0A31B715 // 2^(16/64)
|
||||
data8 0x00033C08B26416FF // 2^(17/64)
|
||||
data8 0x000371A7373AA9CB // 2^(18/64)
|
||||
data8 0x0003A7DB34E59FF7 // 2^(19/64)
|
||||
data8 0x0003DEA64C123422 // 2^(20/64)
|
||||
data8 0x0004160A21F72E2A // 2^(21/64)
|
||||
data8 0x00044E086061892D // 2^(22/64)
|
||||
data8 0x000486A2B5C13CD0 // 2^(23/64)
|
||||
data8 0x0004BFDAD5362A27 // 2^(24/64)
|
||||
data8 0x0004F9B2769D2CA7 // 2^(25/64)
|
||||
data8 0x0005342B569D4F82 // 2^(26/64)
|
||||
data8 0x00056F4736B527DA // 2^(27/64)
|
||||
data8 0x0005AB07DD485429 // 2^(28/64)
|
||||
data8 0x0005E76F15AD2148 // 2^(29/64)
|
||||
data8 0x0006247EB03A5585 // 2^(30/64)
|
||||
data8 0x0006623882552225 // 2^(31/64)
|
||||
data8 0x0006A09E667F3BCD // 2^(32/64)
|
||||
data8 0x0006DFB23C651A2F // 2^(33/64)
|
||||
data8 0x00071F75E8EC5F74 // 2^(34/64)
|
||||
data8 0x00075FEB564267C9 // 2^(35/64)
|
||||
data8 0x0007A11473EB0187 // 2^(36/64)
|
||||
data8 0x0007E2F336CF4E62 // 2^(37/64)
|
||||
data8 0x00082589994CCE13 // 2^(38/64)
|
||||
data8 0x000868D99B4492ED // 2^(39/64)
|
||||
data8 0x0008ACE5422AA0DB // 2^(40/64)
|
||||
data8 0x0008F1AE99157736 // 2^(41/64)
|
||||
data8 0x00093737B0CDC5E5 // 2^(42/64)
|
||||
data8 0x00097D829FDE4E50 // 2^(43/64)
|
||||
data8 0x0009C49182A3F090 // 2^(44/64)
|
||||
data8 0x000A0C667B5DE565 // 2^(45/64)
|
||||
data8 0x000A5503B23E255D // 2^(46/64)
|
||||
data8 0x000A9E6B5579FDBF // 2^(47/64)
|
||||
data8 0x000AE89F995AD3AD // 2^(48/64)
|
||||
data8 0x000B33A2B84F15FB // 2^(49/64)
|
||||
data8 0x000B7F76F2FB5E47 // 2^(50/64)
|
||||
data8 0x000BCC1E904BC1D2 // 2^(51/64)
|
||||
data8 0x000C199BDD85529C // 2^(52/64)
|
||||
data8 0x000C67F12E57D14B // 2^(53/64)
|
||||
data8 0x000CB720DCEF9069 // 2^(54/64)
|
||||
data8 0x000D072D4A07897C // 2^(55/64)
|
||||
data8 0x000D5818DCFBA487 // 2^(56/64)
|
||||
data8 0x000DA9E603DB3285 // 2^(57/64)
|
||||
data8 0x000DFC97337B9B5F // 2^(58/64)
|
||||
data8 0x000E502EE78B3FF6 // 2^(59/64)
|
||||
data8 0x000EA4AFA2A490DA // 2^(60/64)
|
||||
data8 0x000EFA1BEE615A27 // 2^(61/64)
|
||||
data8 0x000F50765B6E4540 // 2^(62/64)
|
||||
data8 0x000FA7C1819E90D8 // 2^(63/64)
|
||||
LOCAL_OBJECT_END(_expf_table)
|
||||
|
||||
|
||||
.section .text
|
||||
// expf entry point and main (no-exception) path.
// Input x arrives in f8; every exit below writes the result to f8.
// Outline:
//   1. Materialise constants: 64/ln(2), ln(2)/64, the DP right shifter,
//      A2 = 1/2 and A3 ~ 1/6, and load the address of _expf_table.
//   2. Early exits: x = -Inf returns 0, x = NaT/NaN/+Inf returns x, x = 0
//      returns 1.
//   3. Load the four argument thresholds from the first 16 bytes of
//      _expf_table and branch out on certain overflow / certain underflow.
//   4. Reduction: N = x*64/ln(2) extracted with the right-shifter trick,
//      N = 64*n + j, R = x - N*ln(2)/64.
//   5. T = 2^n * 2^(j/64) assembled as DP bits from the table entry and n;
//      P = R + A2*R^2 + A3*R^3; result = T + P*T.
//   6. Possible underflow (p13) / possible overflow (p14) are handled out of
//      line after P and T are available.
GLOBAL_IEEE754_ENTRY(expf)
|
||||
|
||||
{ .mlx
|
||||
addl rTblAddr = @ltoff(_expf_table),gp
|
||||
movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
|
||||
}
|
||||
{ .mlx
|
||||
addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
|
||||
movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// point to the beginning of the table
|
||||
ld8 rTblAddr = [rTblAddr]
|
||||
fclass.m p14, p0 = f8, 0x22 // test for -INF
|
||||
shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnorm.s1 fNormX = f8 // normalized x
|
||||
addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
|
||||
fclass.m p15, p0 = f8, 0x1e1 // test for NaT,NaN,+Inf
|
||||
nop.i 0
|
||||
}
|
||||
{ .mlx
|
||||
// load Right Shifter to FP reg
|
||||
setf.d fRightShifter = rRightShifter
|
||||
movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
setf.s fA3 = rA3 // load A3 to FP reg
|
||||
(p14) fma.s.s0 f8 = f0, f1, f0 // result if x = -inf
|
||||
(p14) br.ret.spnt b0 // exit here if x = -inf
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.exp fA2 = rExpHalf // load A2 to FP reg
|
||||
// p6 is not consumed on this path; the compare runs in sf0 only for its flag
|
||||
fcmp.eq.s0 p6, p0 = f8, f0 // Dummy to flag denorm
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
|
||||
(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,+Inf
|
||||
(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,+Inf
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
// overflow and underflow_zero threshold
|
||||
ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_ZERO_ARG = [rTblAddr], 8
|
||||
(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0
|
||||
(p13) br.ret.spnt b0 // exit here if x = 0.0
|
||||
}
|
||||
;;
|
||||
|
||||
// max normal and underflow_denorm threshold
|
||||
// (after this second post-increment rTblAddr points at the 2^(j/64) entries)
{ .mfi
|
||||
ldfps fMAX_SGL_NORM_ARG, fMIN_SGL_NORM_ARG = [rTblAddr], 8
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// x*(64/ln(2)) + Right Shifter
|
||||
fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Divide arguments into the following categories:
|
||||
// Certain Underflow p11 - -inf < x <= MAX_SGL_ZERO_ARG
|
||||
// Possible Underflow p13 - MAX_SGL_ZERO_ARG < x < MIN_SGL_NORM_ARG
|
||||
// Certain Safe - MIN_SGL_NORM_ARG <= x <= MAX_SGL_NORM_ARG
|
||||
// Possible Overflow p14 - MAX_SGL_NORM_ARG < x < MIN_SGL_OFLOW_ARG
|
||||
// Certain Overflow p15 - MIN_SGL_OFLOW_ARG <= x < +inf
|
||||
//
|
||||
// If the input is really a single arg, then there will never be
|
||||
// "Possible Overflow" arguments.
|
||||
//
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// check for overflow
|
||||
fcmp.ge.s1 p15, p0 = fNormX, fMIN_SGL_OFLOW_ARG
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// check for underflow and tiny (+0) result
|
||||
fcmp.le.s1 p11, p0 = fNormX, fMAX_SGL_ZERO_ARG
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fms.s1 fN = fNint, f1, fRightShifter // n in FP register
|
||||
// branch out if overflow
|
||||
(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
getf.sig rNJ = fNint // bits of n, j
|
||||
// check for underflow and denormal result
|
||||
fcmp.lt.s1 p13, p0 = fNormX, fMIN_SGL_NORM_ARG
|
||||
// branch out if underflow and tiny (+0) result
|
||||
(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// check for possible overflow
|
||||
fcmp.gt.s1 p14, p0 = fNormX, fMAX_SGL_NORM_ARG
|
||||
extr.u rJ = rNJ, 0, 6 // bits of j
|
||||
}
|
||||
{ .mfi
|
||||
// 0xFFFF - 63 = 0xFFC0 = 1023 * 64: the DP exponent bias, pre-shifted by the
// six j bits that the following shr by 6 discards
addl rN = 0xFFFF - 63, rNJ // biased and shifted n
|
||||
fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
|
||||
nop.f 0
|
||||
shr rN = rN, 6 // biased n
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
ld8 rJ = [rJ]
|
||||
nop.f 0
|
||||
shl rN = rN, 52 // 2^n bits in DP format
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format
|
||||
nop.f 0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.d fT = rN // 2^n * 2^(j/64)
|
||||
fma.s1 fP = fA3, fR, fA2 // A3*R + A2
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fRSqr = fR, fR, f0 // R^2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mbb
|
||||
nop.m 0
|
||||
// branch out if possible underflow
|
||||
(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW
|
||||
// branch out if possible overflow result
|
||||
(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// final result in the absence of over- and underflow
|
||||
fma.s.s0 f8 = fP, fT, fT
|
||||
// exit here in the absence of over- and underflow
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
// Possible-overflow path of expf. Entered with fP and fT already computed.
// The result T + P*T is evaluated once in status field 2 with wre set and
// compared against 2^128. If it is >= 2^128 (p6) control goes to
// EXP_CERTAIN_OVERFLOW; otherwise the result is recomputed in sf0 into f8
// and returned.
EXP_POSSIBLE_OVERFLOW:
|
||||
|
||||
// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG
|
||||
// This cannot happen if input is a single, only if input higher precision.
|
||||
// Overflow is a possibility, not a certainty.
|
||||
|
||||
// Recompute result using status field 2 with user's rounding mode,
|
||||
// and wre set. If result is larger than largest single, then we have
|
||||
// overflow
|
||||
|
||||
{ .mfi
|
||||
// 0x1007f = 0xFFFF + 0x80, i.e. the register-format exponent of 2^128
mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
|
||||
fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
|
||||
fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x40 // Turn off wre in sf2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
nop.f 0
|
||||
(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// same expression as the main path, now known not to overflow
fma.s.s0 f8 = fP, fT, fT
|
||||
br.ret.sptk b0 // Exit if really no overflow
|
||||
}
|
||||
;;
|
||||
|
||||
// here if overflow
|
||||
// Certain-overflow path of expf. Builds a huge value (exponent 0x1FFFE) in
// fTmp and evaluates fTmp*fTmp + fTmp in single precision under sf0, which
// yields the +INF result and raises I and O. The original argument (still in
// f8 on this path) is copied to FR_X, error tag 16 is selected and control
// transfers to __libm_error_region.
// NOTE(review): tag 16 is presumably the expf-overflow code understood by
// __libm_error_support -- confirm against the error-code table.
EXP_CERTAIN_OVERFLOW:
|
||||
{ .mmi
|
||||
addl r17ones_m1 = 0x1FFFE, r0
|
||||
;;
|
||||
setf.exp fTmp = r17ones_m1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// register frame for the GR_* registers used by __libm_error_region
// (assumes the GR_* equates, defined outside this view, map into r32+)
alloc r32=ar.pfs,0,3,4,0
|
||||
fmerge.s FR_X = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 16
|
||||
fma.s.s0 FR_RESULT = fTmp, fTmp, fTmp // Set I,O and +INF result
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
// Possible-underflow path of expf. Entered with fP and fT already computed.
// The result T + P*T is evaluated in status field 2 with ftz set; if that
// value is zero (p6) the sf0 result is treated as a real underflow and
// control goes to EXP_UNDERFLOW_COMMON, otherwise (p7) the sf0 result in f8
// is returned directly.
EXP_POSSIBLE_UNDERFLOW:
|
||||
|
||||
// Here if fMAX_SGL_ZERO_ARG < x < fMIN_SGL_NORM_ARG
|
||||
// Underflow is a possibility, not a certainty
|
||||
|
||||
// We define an underflow when the answer with
|
||||
// ftz set
|
||||
// is zero (tiny numbers become zero)
|
||||
|
||||
// Notice (from below) that if we have an unlimited exponent range,
|
||||
// then there is an extra machine number E between the largest denormal and
|
||||
// the smallest normal.
|
||||
|
||||
// So if with unbounded exponent we round to E or below, then we are
|
||||
// tiny and underflow has occurred.
|
||||
|
||||
// But notice that you can be in a situation where we are tiny, namely
|
||||
// rounded to E, but when the exponent is bounded we round to smallest
|
||||
// normal. So the answer can be the smallest normal with underflow.
|
||||
|
||||
//                           E
|
||||
// -----+--------------------+--------------------+-----
|
||||
//      |                    |                    |
|
||||
//   1.1...10 2^-3fff    1.1...11 2^-3fff    1.0...00 2^-3ffe
|
||||
//   0.1...11 2^-3ffe                                   (biased, 1)
|
||||
//    largest dn                               smallest normal
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s.s2 fFtz_urm_f8 = fP, fT, fT // Result with ftz set
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fsetc.s2 0x7F,0x40 // Turn off ftz in sf2
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s.s0 f8 = fP, fT, fT // Compute result, set I, maybe U
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mbb
|
||||
nop.m 0
|
||||
(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow
|
||||
(p7) br.ret.sptk b0 // Exit if really no underflow
|
||||
}
|
||||
;;
|
||||
|
||||
// Certain-underflow path of expf. Forms a tiny value (exponent field 1) whose
// significand is borrowed from f64DivLn2, then squares it in single precision
// under sf0 so that the result is tiny and I, U are raised. Continues at
// EXP_UNDERFLOW_COMMON.
EXP_CERTAIN_UNDERFLOW:
|
||||
// Here if x <= fMAX_SGL_ZERO_ARG
|
||||
// Result will be zero (or smallest denorm if round to +inf) with I, U set
|
||||
{ .mmi
|
||||
mov rTmp = 1
|
||||
;;
|
||||
setf.exp fTmp = rTmp // Form small normal
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// keeps sign/exponent of fTmp, takes the significand of 64/ln(2)
fmerge.se fTmp = fTmp, f64DivLn2 // Small with non-trivial signif
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fma.s.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
|
||||
br.cond.sptk EXP_UNDERFLOW_COMMON
|
||||
}
|
||||
;;
|
||||
|
||||
// Shared tail of both underflow paths of expf. f8 already holds the
// underflowed result. Tests whether it is zero (p6), saves the normalized
// argument in FR_X (f8 no longer holds x here) and dispatches to
// EXP_UNDERFLOW_ZERO or falls through to EXP_UNDERFLOW_NONZERO.
EXP_UNDERFLOW_COMMON:
|
||||
// Determine if underflow result is zero or nonzero
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,0,3,4,0
|
||||
fcmp.eq.s1 p6, p0 = f8, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fmerge.s FR_X = fNormX,fNormX
|
||||
(p6) br.cond.spnt EXP_UNDERFLOW_ZERO
|
||||
}
|
||||
;;
|
||||
|
||||
// Underflow with a nonzero result: selects error tag 17 and hands over to
// __libm_error_region. The code is identical to EXP_UNDERFLOW_ZERO below.
// NOTE(review): "FR_RESULT already set" assumes FR_RESULT aliases f8; its
// definition is outside this view.
EXP_UNDERFLOW_NONZERO:
|
||||
// Here if x < fMIN_SGL_NORM_ARG and result nonzero;
|
||||
// I, U are set
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 17
|
||||
nop.f 0 // FR_RESULT already set
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
// Underflow with a zero result: selects error tag 17 and hands over to
// __libm_error_region. The code is identical to EXP_UNDERFLOW_NONZERO above.
// NOTE(review): "FR_RESULT already set" assumes FR_RESULT aliases f8; its
// definition is outside this view.
EXP_UNDERFLOW_ZERO:
|
||||
// Here if x < fMIN_SGL_NORM_ARG and result zero;
|
||||
// I, U are set
|
||||
{ .mfb
|
||||
mov GR_Parameter_TAG = 17
|
||||
nop.f 0 // FR_RESULT already set
|
||||
br.cond.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
// End of the expf procedure, followed by its public aliases.
// NOTE(review): libm_alias_float_other is a macro defined outside this view;
// presumably it emits the additional float-type aliases for the function.
// In shared builds expf gets the default version GLIBC_2.27, and a weak
// __expf_compat alias of __expf is bound to the older expf@GLIBC_2.2.
GLOBAL_IEEE754_END(expf)
|
||||
libm_alias_float_other (__exp, exp)
|
||||
#ifdef SHARED
|
||||
.symver expf,expf@@GLIBC_2.27
|
||||
.weak __expf_compat
|
||||
.set __expf_compat,__expf
|
||||
.symver __expf_compat,expf@GLIBC_2.2
|
||||
#endif
|
||||
|
||||
|
||||
// Error-reporting stub shared by the expf exception paths.
// On entry: GR_Parameter_TAG holds the error tag, FR_X the argument and
// FR_RESULT the provisional result (all set by the caller paths).
// Creates a 64-byte frame, stores single-precision values at
//   sp+16 (parameter 1, FR_X), sp+32 (parameter 2, FR_Y) and
//   sp+48 (parameter 3, FR_RESULT),
// leaves their addresses in GR_Parameter_X / _Y / _RESULT, calls
// __libm_error_support, reloads f8 from sp+48 and returns to expf's caller
// after restoring sp, b0, gp and ar.pfs.
// NOTE(review): FR_Y is defined outside this view; expf has a single
// argument, so the value stored as parameter 2 is presumably a don't-care.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// stores at (new sp)+32, then post-increments GR_Parameter_Y to (new sp)+48
stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mfi
|
||||
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
nop.f 0
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
}
|
||||
{ .mib
|
||||
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
// step GR_Parameter_Y back to the parameter 2 slot at sp+32
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
|
||||
// recompute the result address after the call instead of relying on the
// register still holding it
{ .mmi
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,555 +0,0 @@
|
||||
.file "fmod.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//====================================================================
|
||||
// 02/02/00 Initial version
|
||||
// 03/02/00 New Algorithm
|
||||
// 04/04/00 Unwind support added
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 11/28/00 Set FR_Y to f9
|
||||
// 03/11/02 Fixed flags for fmod(qnan,zero)
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno
|
||||
//
|
||||
// API
|
||||
//====================================================================
|
||||
// double fmod(double,double);
|
||||
//
|
||||
// Overview of operation
|
||||
//====================================================================
|
||||
// fmod(a,b)=a-i*b,
|
||||
// where i is an integer such that, if b!=0,
|
||||
// |i|<=|a/b| and |a/b-i|<1
|
||||
//
|
||||
// Algorithm
|
||||
//====================================================================
|
||||
// a). if |a|<|b|, return a
|
||||
// b). get quotient and reciprocal overestimates accurate to
|
||||
// 33 bits (q2,y2)
|
||||
// c). if the exponent difference (exponent(a)-exponent(b))
|
||||
// is less than 32, truncate quotient to integer and
|
||||
// finish in one iteration
|
||||
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
|
||||
// round quotient estimate to single precision (k=RN(q2)),
|
||||
// calculate partial remainder (a'=a-k*b),
|
||||
// get quotient estimate (a'*y2), and repeat from c).
|
||||
//
|
||||
// Special cases
|
||||
//====================================================================
|
||||
// b=+/-0: return NaN, call libm_error_support
|
||||
// a=+/-Inf, a=NaN or b=NaN: return NaN
|
||||
//
|
||||
// Registers used
|
||||
//====================================================================
|
||||
// Predicate registers: p6-p11
|
||||
// General registers: r2,r29,r32 (ar.pfs), r33-r40
|
||||
// Floating point registers: f6-f15
|
||||
|
||||
// Symbolic register names for fmod.
// GR_SAVE_* hold b0, ar.pfs and gp across the call made in
// __libm_error_region (GR_SAVE_SP is defined but not referenced in the fmod
// code of this file).
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
// Registers carrying the addresses of x, y and the result, plus the error
// tag, to __libm_error_support.
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
// FP registers whose values __libm_error_region stores to the stack.
FR_X = f10
|
||||
FR_Y = f9
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
.section .text
|
||||
// fmod(a, b) entry: a in f8, b in f9, result in f8.
// Forms |a| (f6) and |b| (f7), starts the reciprocal approximation y0,
// classifies the operands and dispatches:
//   p9: a is NaN or Inf        -> FMOD_X_NAN_INF
//   p7: b is NaN, Inf or zero  -> FMOD_Y_NAN_INF_ZERO
//   p8: |a| < |b|              -> return a (passed through fma.d.s0)
// Otherwise steps (1)-(8) refine the quotient estimate q (f13) and the
// reciprocal y (f10), and execution falls into loop53 with
//   f6 = |a|, f7 = |b|, f9 = |b| carrying the sign of a, f10 = y2,
//   f13 = q2, f12 = 1.25*2^-24, f15 = 2^32, p6 = p10 = 0.
GLOBAL_IEEE754_ENTRY(fmod)
|
||||
|
||||
// inputs in f8, f9
|
||||
// result in f8
|
||||
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,1,4,4,0
|
||||
// f6=|a|
|
||||
fmerge.s f6=f0,f8
|
||||
// exponent of 2^-34, consumed by step (5)
mov r2 = 0x0ffdd
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f7=|b|
|
||||
fmerge.s f7=f0,f9
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
setf.exp f11 = r2
|
||||
// (1) y0
|
||||
frcpa.s1 f10,p6=f6,f7
|
||||
nop.i 0
|
||||
}
|
||||
|
||||
// Y +-NAN, +-inf, +-0? p7
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p7,p0 = f9, 0xe7
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
// fclass mask layout for the X test below (0xe3):
// qnan snan inf norm     unorm 0 -+
|
||||
// 1    1    1   0        0     0 11
|
||||
// e                      3
|
||||
// X +-NAN, +-inf, ? p9
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p9,p0 = f8, 0xe3
|
||||
nop.i 999
|
||||
}
|
||||
|
||||
// |x| < |y|? Return x p8
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.lt.unc.s1 p8,p0 = f6,f7
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// normalize y (if |x|<|y|)
|
||||
(p8) fma.s0 f9=f9,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
// exponent of 2^32, loaded into f15 further down
mov r2=0x1001f
|
||||
// (2) q0=a*y0
|
||||
(p6) fma.s1 f13=f6,f10,f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (3) e0 = 1 - b * y0
|
||||
(p6) fnma.s1 f12=f7,f10,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize x (if |x|<|y|)
|
||||
(p8) fma.d.s0 f8=f8,f1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.bbb
|
||||
(p9) br.cond.spnt FMOD_X_NAN_INF
|
||||
(p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
|
||||
// if |x|<|y|, return
|
||||
(p8) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize x
|
||||
fma.s0 f6=f6,f1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize y
|
||||
fma.s0 f7=f7,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// f15=2^32
|
||||
setf.exp f15=r2
|
||||
// (4) q1=q0+e0*q0
|
||||
(p6) fma.s1 f13=f12,f13,f13
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (5) e1 = e0 * e0 + 2^-34
|
||||
(p6) fma.s1 f14=f12,f12,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mlx
|
||||
nop.m 0
|
||||
// single-precision bit pattern of 1.25*2^-24 (moved to f12 below)
movl r2=0x33a00000;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (6) y1 = y0 + e0 * y0
|
||||
(p6) fma.s1 f10=f12,f10,f10
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
// set f12=1.25*2^{-24}
|
||||
setf.s f12=r2
|
||||
// (7) q2=q1+e1*q1
|
||||
(p6) fma.s1 f13=f13,f14,f13
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f9 = |b| with the sign of a (used for the final correction in loop53)
fmerge.s f9=f8,f9
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (8) y2 = y1 + e1 * y1
|
||||
(p6) fma.s1 f10=f14,f10,f10
|
||||
// set p6=0, p10=0
|
||||
cmp.ne.and p6,p10=r0,r0;;
|
||||
}
|
||||
|
||||
// loop53: iterative remainder reduction for fmod.
// Each pass compares the quotient estimate q2 (f13) with 2^32 (f15):
//   p8 (q2 < 2^32, final pass): truncate q2 to an integer q, form the
//       remainder f11 = f6 - q*f7; if it is negative (p6) add |b| back,
//       otherwise (p10) keep it; apply the sign of a, write f8 in double
//       precision and return.
//   p7 (otherwise): round q2 to single precision (k), form the partial
//       remainder f14 = f6 - k*f7 and a second one f6 = f6 - k'*f7 with
//       k' = k - k*(1.25*2^-24); estimate the next quotient as f14*y2 and,
//       if f14 turned out negative, re-estimate it from the k' remainder;
//       otherwise (p9) adopt f14 as the new f6. Then iterate.
// f12 still holds 1.25*2^-24 on every p7 pass because it is overwritten with
// sgn(a) only under p8, i.e. on the final pass.
.align 32
|
||||
loop53:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// compare q2, 2^32
|
||||
fcmp.lt.unc.s1 p8,p7=f13,f15
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// will truncate quotient to integer, if exponent<32 (in advance)
|
||||
fcvt.fx.trunc.s1 f11=f13
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if q2>=2^32, round quotient to single precision (perform in advance)
|
||||
fma.s.s1 f13=f13,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// set f12=sgn(a)
|
||||
(p8) fmerge.s f12=f8,f1
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize truncated quotient
|
||||
(p8) fcvt.xf f13=f11
|
||||
nop.i 0;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// calculate remainder (assuming f13=RZ(Q))
|
||||
(p7) fnma.s1 f14=f13,f7,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// also if q2>=2^32, round quotient to single precision
|
||||
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
|
||||
(p7) fnma.s.s1 f11=f13,f12,f13
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// (p8) calculate remainder (82-bit format)
|
||||
(p8) fnma.s1 f11=f13,f7,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// (p7) calculate remainder (assuming f11=RZ(Q))
|
||||
(p7) fnma.s1 f6=f11,f7,f6
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// Final iteration (p8): is f11 the correct remainder (quotient was not overestimated) ?
|
||||
(p8) fcmp.lt.unc.s1 p6,p10=f11,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// get new quotient estimation: a'*y2
|
||||
(p7) fma.s1 f13=f14,f10,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// was f14=RZ(Q) ? (then new remainder f14>=0)
|
||||
(p7) fcmp.lt.unc.s1 p7,p9=f14,f0
|
||||
nop.b 0;;
|
||||
}
|
||||
|
||||
|
||||
.pred.rel "mutex",p6,p10
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// add b to estimated remainder (to cover the case when the quotient was overestimated)
|
||||
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
|
||||
(p6) fma.d.s0 f8=f11,f12,f9
|
||||
nop.b 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// calculate remainder (double precision)
|
||||
// set correct sign of result before returning
|
||||
(p10) fma.d.s0 f8=f11,f12,f0
|
||||
(p8) br.ret.sptk b0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if f13!=RZ(Q), get alternative quotient estimation: a''*y2
|
||||
(p7) fma.s1 f13=f6,f10,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if f14 was RZ(Q), set remainder to f14
|
||||
(p9) mov f6=f14
|
||||
br.cond.sptk loop53;;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// fmod special case: a (f8) is NaN or +/-Inf.
//   a NaN and b zero        -> return a (after fma.s0 f8=f8*1+0)
//   b zero otherwise        -> FMOD_Y_ZERO
//   a Inf and b not NaN     -> frcpa(a,a) in sf0, then fma.d; per the
//                              comment below this yields QNaN indefinite
//   a NaN, or a Inf and b NaN -> frcpa(a,b) in sf0 produces the result
FMOD_X_NAN_INF:
|
||||
|
||||
// Y zero ?
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p10,p0=f8,0xc3 // Test x=nan
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f10 = y passed through an s1 fma; compared with zero below (p11)
fma.s1 f10=f9,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s0 f8=f8,f1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// only evaluated when x is NaN; p10 then becomes "x NaN and y zero"
(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fcmp.eq.unc.s1 p11,p0=f10,f0
|
||||
(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
|
||||
}
|
||||
{.mib
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
// if Y zero
|
||||
(p11) br.cond.spnt FMOD_Y_ZERO;;
|
||||
}
|
||||
|
||||
// X infinity? Return QNAN indefinite
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p8,p9 = f8, 0x23
|
||||
nop.i 999;;
|
||||
}
|
||||
// Y NaN ?
|
||||
// (if x is Inf and y is NaN, p9 is set and p8 cleared, selecting frcpa(x,y))
{.mfi
|
||||
nop.m 999
|
||||
(p8) fclass.m p9,p8=f9,0xc3
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p8) frcpa.s0 f8,p0 = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
// also set Denormal flag if necessary
|
||||
(p8) fma.s0 f9=f9,f1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p8) fma.d.s0 f8=f8,f1,f0
|
||||
nop.b 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p9) frcpa.s0 f8,p7=f8,f9
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
|
||||
// fmod special case: b (f9) is NaN, +/-Inf or zero, and a is neither NaN nor
// Inf (that case branched to FMOD_X_NAN_INF first).
//   b Inf  -> return a, passed through fma.d.s0
//   b NaN  -> return b, passed through fma.d.s0
//   else   -> fall through to FMOD_Y_ZERO
FMOD_Y_NAN_INF_ZERO:
|
||||
|
||||
// Y INF
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p7,p0 = f9, 0x23
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p7) fma.d.s0 f8=f8,f1,f0
|
||||
(p7) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
// Y NAN?
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p9,p0 = f9, 0xc3
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p9) fma.d.s0 f8=f9,f1,f0
|
||||
(p9) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
// fmod special case: b is zero. Raises Invalid via frcpa(0,0), builds the
// result in f11 (frcpa(x,0) when p9, frcpa(y,y) when p10), copies the
// original x to f10 (FR_X), moves the result to f8 in double precision,
// selects error tag 121 and transfers to __libm_error_region.
// NOTE(review): the second fclass (.nm with mask 0xff) sets p9 when x matches
// none of the listed classes -- presumably NaT; confirm.
FMOD_Y_ZERO:
|
||||
// Y zero? Must be zero at this point
|
||||
// because it is the only choice left.
|
||||
// Return QNAN indefinite
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// set Invalid
|
||||
frcpa.s0 f12,p0=f0,f0
|
||||
nop.i 0
|
||||
}
|
||||
// X NAN?
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p9,p10 = f8, 0xc3
|
||||
nop.i 999 ;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p10) fclass.nm p9,p10 = f8, 0xff
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p9) frcpa.s0 f11,p7=f8,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p10) frcpa.s0 f11,p7 = f9,f9
|
||||
mov GR_Parameter_TAG = 121 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
// FR_X (f10) = original x for the error handler
fmerge.s f10 = f8, f8
|
||||
nop.i 999
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
fma.d.s0 f8=f11,f1,f0
|
||||
br.sptk __libm_error_region;;
|
||||
}
|
||||
|
||||
GLOBAL_IEEE754_END(fmod)
|
||||
libm_alias_double_other (__fmod, fmod)
|
||||
|
||||
// Error-reporting stub for fmod (reached from FMOD_Y_ZERO).
// On entry: GR_Parameter_TAG holds the error tag, FR_X (f10) the first
// argument, FR_Y (f9) the second and FR_RESULT (f8) the provisional result.
// Creates a 64-byte frame, stores double-precision values at
//   sp+16 (parameter 1, FR_X), sp+32 (parameter 2, FR_Y) and
//   sp+48 (parameter 3, FR_RESULT),
// leaves their addresses in GR_Parameter_X / _Y / _RESULT, calls
// __libm_error_support, reloads f8 from sp+48 and returns to fmod's caller
// after restoring sp, b0, gp and ar.pfs.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// stores at (new sp)+32, then post-increments GR_Parameter_Y to (new sp)+48
stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
// step GR_Parameter_Y back to the parameter 2 slot at sp+32
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
// recompute the result address after the call instead of relying on the
// register still holding it
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,569 +0,0 @@
|
||||
.file "fmodf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//====================================================================
|
||||
// 02/02/00 Initial version
|
||||
// 03/02/00 New Algorithm
|
||||
// 04/04/00 Unwind support added
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 11/28/00 Set FR_Y to f9
|
||||
// 03/11/02 Fixed flags for fmodf(qnan,zero)
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno
|
||||
//
|
||||
// API
|
||||
//====================================================================
|
||||
// float fmodf(float,float);
|
||||
//
|
||||
// Overview of operation
|
||||
//====================================================================
|
||||
// fmod(a,b)=a-i*b,
|
||||
// where i is an integer such that, if b!=0,
|
||||
// |i|<=|a/b| and |a/b-i|<1
|
||||
|
||||
// Algorithm
|
||||
//====================================================================
|
||||
// a). if |a|<|b|, return a
|
||||
// b). get quotient and reciprocal overestimates accurate to
|
||||
// 33 bits (q2,y2)
|
||||
// c). if the exponent difference (exponent(a)-exponent(b))
|
||||
// is less than 32, truncate quotient to integer and
|
||||
// finish in one iteration
|
||||
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
|
||||
// round quotient estimate to single precision (k=RN(q2)),
|
||||
// calculate partial remainder (a'=a-k*b),
|
||||
// get quotient estimate (a'*y2), and repeat from c).
|
||||
|
||||
// Special cases
|
||||
//====================================================================
|
||||
// b=+/-0: return NaN, call libm_error_support
|
||||
// a=+/-Inf, a=NaN or b=NaN: return NaN
|
||||
|
||||
// Registers used
|
||||
//====================================================================
|
||||
// Predicate registers: p6-p11
|
||||
// General registers: r2,r29,r32 (ar.pfs), r33-r40
|
||||
// Floating point registers: f6-f15
|
||||
|
||||
// Symbolic register names for fmodf.
// GR_SAVE_* name the registers intended to preserve b0, ar.pfs, gp and sp
// around the error-handler call (the code using them is outside this view).
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
// Registers for the addresses of x, y and the result, plus the error tag.
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
// FP registers holding the first argument copy, the second argument and the
// result.
FR_X = f10
|
||||
FR_Y = f9
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
.section .text
|
||||
GLOBAL_IEEE754_ENTRY(fmodf)
|
||||
|
||||
// inputs in f8, f9
|
||||
// result in f8
|
||||
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,1,4,4,0
|
||||
// f6=|a|
|
||||
fmerge.s f6=f0,f8
|
||||
mov r2 = 0x0ffdd
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f7=|b|
|
||||
fmerge.s f7=f0,f9
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
setf.exp f11 = r2
|
||||
// (1) y0
|
||||
frcpa.s1 f10,p6=f6,f7
|
||||
nop.i 0
|
||||
}
|
||||
|
||||
// eliminate special cases
|
||||
// Y +-NAN, +-inf, +-0? p7
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p7,p0 = f9, 0xe7
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
// qnan snan inf norm unorm 0 -+
|
||||
// 1 1 1 0 0 0 11
|
||||
// e 3
|
||||
// X +-NAN, +-inf, ? p9
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p9,p0 = f8, 0xe3
|
||||
nop.i 999
|
||||
}
|
||||
|
||||
// |x| < |y|? Return x p8
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fcmp.lt.unc.s1 p8,p0 = f6,f7
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// normalize y (if |x|<|y|)
|
||||
(p8) fma.s0 f9=f9,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
mov r2=0x1001f
|
||||
// (2) q0=a*y0
|
||||
(p6) fma.s1 f13=f6,f10,f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (3) e0 = 1 - b * y0
|
||||
(p6) fnma.s1 f12=f7,f10,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize x (if |x|<|y|)
|
||||
(p8) fma.s.s0 f8=f8,f1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.bbb
|
||||
(p9) br.cond.spnt FMOD_X_NAN_INF
|
||||
(p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
|
||||
// if |x|<|y|, return
|
||||
(p8) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize x
|
||||
fma.s0 f6=f6,f1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize y
|
||||
fma.s0 f7=f7,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
// f15=2^32
|
||||
setf.exp f15=r2
|
||||
// (4) q1=q0+e0*q0
|
||||
(p6) fma.s1 f13=f12,f13,f13
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (5) e1 = e0 * e0 + 2^-34
|
||||
(p6) fma.s1 f14=f12,f12,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mlx
|
||||
nop.m 0
|
||||
movl r2=0x33a00000;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (6) y1 = y0 + e0 * y0
|
||||
(p6) fma.s1 f10=f12,f10,f10
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
// set f12=1.25*2^{-24}
|
||||
setf.s f12=r2
|
||||
// (7) q2=q1+e1*q1
|
||||
(p6) fma.s1 f13=f13,f14,f13
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fmerge.s f9=f8,f9
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (8) y2 = y1 + e1 * y1
|
||||
(p6) fma.s1 f10=f14,f10,f10
|
||||
// set p6=0, p10=0
|
||||
cmp.ne.and p6,p10=r0,r0;;
|
||||
}
|
||||
|
||||
.align 32
|
||||
loop24:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// compare q2, 2^32
|
||||
fcmp.lt.unc.s1 p8,p7=f13,f15
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// will truncate quotient to integer, if exponent<32 (in advance)
|
||||
fcvt.fx.trunc.s1 f11=f13
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if exponent>32, round quotient to single precision (perform in advance)
|
||||
fma.s.s1 f13=f13,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// set f12=sgn(a)
|
||||
(p8) fmerge.s f12=f8,f1
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize truncated quotient
|
||||
(p8) fcvt.xf f13=f11
|
||||
nop.i 0;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// calculate remainder (assuming f13=RZ(Q))
|
||||
(p7) fnma.s1 f14=f13,f7,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// also if exponent>32, round quotient to single precision
|
||||
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
|
||||
(p7) fnma.s.s1 f11=f13,f12,f13
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// (p8) calculate remainder (82-bit format)
|
||||
(p8) fnma.s1 f11=f13,f7,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// (p7) calculate remainder (assuming f11=RZ(Q))
|
||||
(p7) fnma.s1 f6=f11,f7,f6
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// Final iteration (p8): is f11 the correct remainder (quotient was not overestimated) ?
|
||||
(p8) fcmp.lt.unc.s1 p6,p10=f11,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// get new quotient estimation: a'*y2
|
||||
(p7) fma.s1 f13=f14,f10,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// was f13=RZ(Q) ? (then new remainder f14>=0)
|
||||
(p7) fcmp.lt.unc.s1 p7,p9=f14,f0
|
||||
nop.b 0;;
|
||||
}
|
||||
|
||||
|
||||
.pred.rel "mutex",p6,p10
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// add b to estimated remainder (to cover the case when the quotient was overestimated)
|
||||
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
|
||||
(p6) fma.s.s0 f8=f11,f12,f9
|
||||
nop.b 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// calculate remainder (single precision)
|
||||
// set correct sign of result before returning
|
||||
(p10) fma.s.s0 f8=f11,f12,f0
|
||||
(p8) br.ret.sptk b0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if f13!=RZ(Q), get alternative quotient estimation: a''*y2
|
||||
(p7) fma.s1 f13=f6,f10,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if the rounded quotient was RZ(Q) (remainder f14>=0), set remainder to f14
|
||||
(p9) mov f6=f14
|
||||
br.cond.sptk loop24;;
|
||||
}
|
||||
|
||||
{ .mmb
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
FMOD_X_NAN_INF:
|
||||
|
||||
|
||||
// Y zero ?
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fclass.m p10,p0=f8,0xc3 // Test x=nan
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s1 f10=f9,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
fma.s0 f8=f8,f1,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
fcmp.eq.unc.s1 p11,p0=f10,f0
|
||||
(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
|
||||
}
|
||||
{.mib
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
// if Y zero
|
||||
(p11) br.cond.spnt FMOD_Y_ZERO;;
|
||||
}
|
||||
|
||||
// X infinity? Return QNAN indefinite
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p8,p9 = f8, 0x23
|
||||
nop.i 999;;
|
||||
}
|
||||
// Y NaN ?
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p8) fclass.m p9,p8=f9,0xc3
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p8) frcpa.s0 f8,p0 = f8,f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
// also set Denormal flag if necessary
|
||||
(p8) fma.s0 f9=f9,f1,f0
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p8) fma.s.s0 f8=f8,f1,f0
|
||||
nop.b 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p9) frcpa.s0 f8,p7=f8,f9
|
||||
br.ret.sptk b0 ;;
|
||||
}
|
||||
|
||||
|
||||
FMOD_Y_NAN_INF_ZERO:
|
||||
|
||||
// Y INF
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p7,p0 = f9, 0x23
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p7) fma.s.s0 f8=f8,f1,f0
|
||||
(p7) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
// Y NAN?
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p9,p0 = f9, 0xc3
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{ .mfb
|
||||
nop.m 999
|
||||
(p9) fma.s.s0 f8=f9,f1,f0
|
||||
(p9) br.ret.spnt b0 ;;
|
||||
}
|
||||
|
||||
FMOD_Y_ZERO:
|
||||
// Y zero? Must be zero at this point
|
||||
// because it is the only choice left.
|
||||
// Return QNAN indefinite
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// set Invalid
|
||||
frcpa.s0 f12,p0=f0,f0
|
||||
nop.i 999
|
||||
}
|
||||
// X NAN?
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fclass.m.unc p9,p10 = f8, 0xc3
|
||||
nop.i 999 ;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p10) fclass.nm p9,p10 = f8, 0xff
|
||||
nop.i 999 ;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 999
|
||||
(p9) frcpa.s0 f11,p7=f8,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
(p10) frcpa.s0 f11,p7 = f0,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fmerge.s f10 = f8, f8
|
||||
nop.i 999
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
fma.s.s0 f8=f11,f1,f0
|
||||
nop.i 999;;
|
||||
}
|
||||
|
||||
EXP_ERROR_RETURN:
|
||||
|
||||
|
||||
{ .mib
|
||||
nop.m 0
|
||||
mov GR_Parameter_TAG=122
|
||||
br.sptk __libm_error_region;;
|
||||
}
|
||||
|
||||
GLOBAL_IEEE754_END(fmodf)
|
||||
libm_alias_float_other (__fmod, fmod)
|
||||
|
||||
// __libm_error_region (fmodf): local error stub reached by a plain branch
// from the fmodf code above, with GR_Parameter_TAG already loaded.
// It opens a 64-byte stack frame, spills FR_Y, FR_X and FR_RESULT into it
// with single-precision stores, and leaves their addresses in
// GR_Parameter_Y / GR_Parameter_X / GR_Parameter_RESULT before calling
// __libm_error_support.  After the call it reloads f8 from the result
// slot, restores sp, b0, gp and ar.pfs, and returns via the saved b0.
//
// Frame layout relative to the new sp (derived from the adds below):
//   sp + 16 : FR_X       (GR_Parameter_X)
//   sp + 32 : FR_Y       (GR_Parameter_Y)
//   sp + 48 : FR_RESULT  (GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 address: incoming sp - 32
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new 64-byte stack frame
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
// sp has moved down by 64, so GR_Parameter_Y now equals sp + 32.
// The store below uses the base-update form: after writing FR_Y the
// address register is advanced by 16, i.e. to sp + 48.
{ .mmi
|
||||
stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp + 48)
|
||||
nop.b 0
|
||||
}
|
||||
// FR_RESULT is stored through GR_Parameter_Y (still sp + 48 here); the add
// in the same bundle then moves GR_Parameter_Y back to the FR_Y slot
// (sp + 32) for the call.
{ .mib
|
||||
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support#;; // Call error handling function
|
||||
}
|
||||
// GR_Parameter_RESULT is recomputed from sp rather than reused after the
// call, then used to reload the (possibly updated) result into f8.
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,672 +0,0 @@
|
||||
.file "fmodl.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2004, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//====================================================================
|
||||
// 02/02/00 Initial version
|
||||
// 03/02/00 New Algorithm
|
||||
// 04/04/00 Unwind support added
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [ the previously overwritten ] GR_Parameter_RESULT.
|
||||
// 11/28/00 Set FR_Y to f9
|
||||
// 03/11/02 Fixed flags for fmodl(qnan, zero)
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header:.section,.global,.proc,.align
|
||||
// 04/28/03 Fix: fmod(sNaN, 0) no longer sets errno
|
||||
// 11/23/04 Reformatted routine and improved speed
|
||||
//
|
||||
// API
|
||||
//====================================================================
|
||||
// long double fmodl(long double, long double);
|
||||
//
|
||||
// Overview of operation
|
||||
//====================================================================
|
||||
// fmod(a, b)= a-i*b,
|
||||
// where i is an integer such that, if b!= 0,
|
||||
// |i|<=|a/b| and |a/b-i|<1
|
||||
//
|
||||
// Algorithm
|
||||
//====================================================================
|
||||
// a). if |a|<|b|, return a
|
||||
// b). get quotient and reciprocal overestimates accurate to
|
||||
// 33 bits (q2, y2)
|
||||
// c). if the exponent difference (exponent(a)-exponent(b))
|
||||
// is less than 32, truncate quotient to integer and
|
||||
// finish in one iteration
|
||||
// d). if exponent(a)-exponent(b)>= 32 (q2>= 2^32)
|
||||
// round quotient estimate to single precision (k= RN(q2)),
|
||||
// calculate partial remainder (a'= a-k*b),
|
||||
// get quotient estimate (a'*y2), and repeat from c).
|
||||
//
|
||||
// Registers used
|
||||
//====================================================================
|
||||
|
||||
// Scratch general-register aliases used by fmodl.
// NOTE(review): GR_SMALLBIASEXP is assigned twice in this list (r2 here,
// r20 two entries below).  Presumably the later assignment is the one in
// effect where the alias is used -- confirm, then drop the stale entry.
GR_SMALLBIASEXP = r2
|
||||
GR_2P32 = r3
|
||||
GR_SMALLBIASEXP = r20
|
||||
GR_ROUNDCONST = r21
|
||||
GR_SIG_B = r22
|
||||
GR_ARPFS = r23
|
||||
GR_TMP1 = r24
|
||||
GR_TMP2 = r25
|
||||
GR_TMP3 = r26
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f9
|
||||
FR_RESULT = f8
|
||||
|
||||
FR_ABS_A = f6
|
||||
FR_ABS_B = f7
|
||||
FR_Y_INV = f10
|
||||
FR_SMALLBIAS = f11
|
||||
FR_E0 = f12
|
||||
FR_Q = f13
|
||||
FR_E1 = f14
|
||||
FR_2P32 = f15
|
||||
FR_TMPX = f32
|
||||
FR_TMPY = f33
|
||||
FR_ROUNDCONST = f34
|
||||
FR_QINT = f35
|
||||
FR_QRND24 = f36
|
||||
FR_NORM_B = f37
|
||||
FR_TMP = f38
|
||||
FR_TMP2 = f39
|
||||
FR_DFLAG = f40
|
||||
FR_Y_INV0 = f41
|
||||
FR_Y_INV1 = f42
|
||||
FR_Q0 = f43
|
||||
FR_Q1 = f44
|
||||
FR_QINT_Z = f45
|
||||
FR_QREM = f46
|
||||
FR_B_SGN_A = f47
|
||||
|
||||
.section .text
|
||||
GLOBAL_IEEE754_ENTRY(fmodl)
|
||||
|
||||
// inputs in f8, f9
|
||||
// result in f8
|
||||
|
||||
{ .mfi
|
||||
getf.sig GR_SIG_B = f9
|
||||
// FR_ABS_A = |a|
|
||||
fmerge.s FR_ABS_A = f0, f8
|
||||
mov GR_SMALLBIASEXP = 0x0ffdd
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// FR_ABS_B = |b|
|
||||
fmerge.s FR_ABS_B = f0, f9
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
setf.exp FR_SMALLBIAS = GR_SMALLBIASEXP
|
||||
// (1) y0
|
||||
frcpa.s1 FR_Y_INV0, p6 = FR_ABS_A, FR_ABS_B
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mlx
|
||||
nop.m 0
|
||||
movl GR_ROUNDCONST = 0x33a00000
|
||||
}
|
||||
;;
|
||||
|
||||
// eliminate special cases
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
// y pseudo-zero ?
|
||||
cmp.eq p7, p10 = GR_SIG_B, r0
|
||||
}
|
||||
;;
|
||||
|
||||
// set p7 if b +/-NAN, +/-inf, +/-0
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p10) fclass.m p7, p10 = f9, 0xe7
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
mov GR_2P32 = 0x1001f
|
||||
// (2) q0 = a*y0
|
||||
(p6) fma.s1 FR_Q0 = FR_ABS_A, FR_Y_INV0, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (3) e0 = 1 - b * y0
|
||||
(p6) fnma.s1 FR_E0 = FR_ABS_B, FR_Y_INV0, f1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// set p9 if a +/-NAN, +/-inf
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fclass.m.unc p9, p11 = f8, 0xe3
|
||||
nop.i 0
|
||||
}
|
||||
// |a| < |b|? Return a, p8=1
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p10) fcmp.lt.unc.s1 p8, p0 = FR_ABS_A, FR_ABS_B
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// set p7 if b +/-NAN, +/-inf, +/-0
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// pseudo-NaN ?
|
||||
(p10) fclass.nm p7, p0 = f9, 0xff
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// set p9 if a is +/-NaN, +/-Inf
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p11) fclass.nm p9, p0 = f8, 0xff
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// b denormal ? set D flag (if |a|<|b|)
|
||||
(p8) fnma.s0 FR_DFLAG = f9, f1, f9
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
// FR_2P32 = 2^32
|
||||
setf.exp FR_2P32 = GR_2P32
|
||||
// (4) q1 = q0+e0*q0
|
||||
(p6) fma.s1 FR_Q1 = FR_E0, FR_Q0, FR_Q0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (5) e1 = e0 * e0 + 2^-34
|
||||
(p6) fma.s1 FR_E1 = FR_E0, FR_E0, FR_SMALLBIAS
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// normalize a (if |a|<|b|)
|
||||
(p8) fma.s0 f8 = f8, f1, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .bbb
|
||||
(p9) br.cond.spnt FMOD_A_NAN_INF
|
||||
(p7) br.cond.spnt FMOD_B_NAN_INF_ZERO
|
||||
// if |a|<|b|, return
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (6) y1 = y0 + e0 * y0
|
||||
(p6) fma.s1 FR_Y_INV1 = FR_E0, FR_Y_INV0, FR_Y_INV0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// a denormal ? set D flag
|
||||
// b denormal ? set D flag
|
||||
fcmp.eq.s0 p12,p0 = FR_ABS_A, FR_ABS_B
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
// set FR_ROUNDCONST = 1.25*2^{-24}
|
||||
setf.s FR_ROUNDCONST = GR_ROUNDCONST
|
||||
// (7) q2 = q1+e1*q1
|
||||
(p6) fma.s1 FR_Q = FR_Q1, FR_E1, FR_Q1
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fmerge.s FR_B_SGN_A = f8, f9
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (8) y2 = y1 + e1 * y1
|
||||
(p6) fma.s1 FR_Y_INV = FR_E1, FR_Y_INV1, FR_Y_INV1
|
||||
// set p6 = 0, p10 = 0
|
||||
cmp.ne.and p6, p10 = r0, r0
|
||||
}
|
||||
;;
|
||||
|
||||
// will compute integer quotient bits (24 bits per iteration)
|
||||
.align 32
|
||||
loop64:
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// compare q2, 2^32
|
||||
fcmp.lt.unc.s1 p8, p7 = FR_Q, FR_2P32
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// will truncate quotient to integer, if exponent<32 (in advance)
|
||||
fcvt.fx.trunc.s1 FR_QINT = FR_Q
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// if exponent>32 round quotient to single precision (perform in advance)
|
||||
fma.s.s1 FR_QRND24 = FR_Q, f1, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// set FR_ROUNDCONST = sgn(a)
|
||||
(p8) fmerge.s FR_ROUNDCONST = f8, f1
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// normalize truncated quotient
|
||||
(p8) fcvt.xf FR_QRND24 = FR_QINT
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// calculate remainder (assuming FR_QRND24 = RZ(Q))
|
||||
(p7) fnma.s1 FR_E1 = FR_QRND24, FR_ABS_B, FR_ABS_A
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// also if exponent>32, round quotient to single precision
|
||||
// and subtract 1 ulp: q = q-q*(1.25*2^{-24})
|
||||
(p7) fnma.s.s1 FR_QINT_Z = FR_QRND24, FR_ROUNDCONST, FR_QRND24
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (p8) calculate remainder (82-bit format)
|
||||
(p8) fnma.s1 FR_QREM = FR_QRND24, FR_ABS_B, FR_ABS_A
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// (p7) calculate remainder (assuming FR_QINT_Z = RZ(Q))
|
||||
(p7) fnma.s1 FR_ABS_A = FR_QINT_Z, FR_ABS_B, FR_ABS_A
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Final iteration (p8): is FR_QREM the correct remainder
|
||||
// (quotient was not overestimated) ?
|
||||
(p8) fcmp.lt.unc.s1 p6, p10 = FR_QREM, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// get new quotient estimation: a'*y2
|
||||
(p7) fma.s1 FR_Q = FR_E1, FR_Y_INV, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// was FR_QRND24 = RZ(Q) ? (then new remainder FR_E1 >= 0)
|
||||
(p7) fcmp.lt.unc.s1 p7, p9 = FR_E1, f0
|
||||
nop.b 0
|
||||
}
|
||||
;;
|
||||
|
||||
.pred.rel "mutex", p6, p10
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// add b to estimated remainder (to cover the case when the quotient was
|
||||
// overestimated)
|
||||
// also set correct sign by using
|
||||
// FR_B_SGN_A = |b|*sgn(a), FR_ROUNDCONST = sgn(a)
|
||||
(p6) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, FR_B_SGN_A
|
||||
nop.b 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// set correct sign of result before returning: FR_ROUNDCONST = sgn(a)
|
||||
(p10) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, f0
|
||||
(p8) br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// if FR_QRND24 != RZ(Q), get alternative quotient estimation: a''*y2
|
||||
(p7) fma.s1 FR_Q = FR_ABS_A, FR_Y_INV, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
// if FR_QRND24 was RZ(Q) (remainder FR_E1 >= 0), set remainder to FR_E1
|
||||
(p9) fma.s1 FR_ABS_A = FR_E1, f1, f0
|
||||
br.cond.sptk loop64
|
||||
}
|
||||
;;
|
||||
|
||||
FMOD_A_NAN_INF:
|
||||
|
||||
// b zero ?
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fclass.m p10, p0 = f8, 0xc3 // Test a = nan
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s1 FR_NORM_B = f9, f1, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fma.s0 f8 = f8, f1, f0
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p10) fclass.m p10, p0 = f9, 0x07 // Test x = nan, and y = zero
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fcmp.eq.unc.s1 p11, p0 = FR_NORM_B, f0
|
||||
(p10) br.ret.spnt b0 // Exit with result = a if a = nan and b = zero
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mib
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
// if Y zero
|
||||
(p11) br.cond.spnt FMOD_B_ZERO
|
||||
}
|
||||
;;
|
||||
|
||||
// a= infinity? Return QNAN indefinite
|
||||
{ .mfi
|
||||
// set p7 to 0
|
||||
cmp.ne p7, p0 = r0, r0
|
||||
fclass.m.unc p8, p9 = f8, 0x23
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// b NaN ?
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fclass.m p9, p8 = f9, 0xc3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// b not pseudo-zero ? (GR_SIG_B holds significand)
|
||||
{ .mii
|
||||
nop.m 0
|
||||
(p8) cmp.ne p7, p0 = GR_SIG_B, r0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) frcpa.s0 f8, p0 = f8, f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// also set Denormal flag if necessary
|
||||
(p7) fnma.s0 f9 = f9, f1, f9
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
(p8) fma.s0 f8 = f8, f1, f0
|
||||
nop.b 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
(p9) frcpa.s0 f8, p7 = f8, f9
|
||||
br.ret.sptk b0
|
||||
}
|
||||
;;
|
||||
|
||||
FMOD_B_NAN_INF_ZERO:
|
||||
// b INF
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fclass.m.unc p7, p0 = f9, 0x23
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
(p7) fma.s0 f8 = f8, f1, f0
|
||||
(p7) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
// b NAN?
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fclass.m.unc p9, p10 = f9, 0xc3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p10) fclass.nm p9, p0 = f9, 0xff
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
(p9) fma.s0 f8 = f9, f1, f0
|
||||
(p9) br.ret.spnt b0
|
||||
}
|
||||
;;
|
||||
|
||||
FMOD_B_ZERO:
|
||||
// Y zero? Must be zero at this point
|
||||
// because it is the only choice left.
|
||||
// Return QNAN indefinite
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// set Invalid
|
||||
frcpa.s0 FR_TMP, p0 = f0, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// a NAN?
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fclass.m.unc p9, p10 = f8, 0xc3
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
alloc GR_ARPFS = ar.pfs, 1, 4, 4, 0
|
||||
(p10) fclass.nm p9, p10 = f8, 0xff
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p9) frcpa.s0 FR_TMP2, p7 = f8, f0
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p10) frcpa.s0 FR_TMP2, p7 = f9, f9
|
||||
mov GR_Parameter_TAG = 120
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fmerge.s FR_X = f8, f8
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfb
|
||||
nop.m 0
|
||||
fma.s0 f8 = FR_TMP2, f1, f0
|
||||
br.sptk __libm_error_region
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_IEEE754_END(fmodl)
|
||||
libm_alias_ldouble_other (__fmod, fmod)
|
||||
|
||||
// __libm_error_region (fmodl): local error stub reached by a plain branch
// from the fmodl code above, with GR_Parameter_TAG already loaded.
// It opens a 64-byte stack frame, spills FR_Y, FR_X and FR_RESULT into it
// with extended-precision stores (stfe), and leaves their addresses in
// GR_Parameter_Y / GR_Parameter_X / GR_Parameter_RESULT before calling
// __libm_error_support.  After the call it reloads f8 from the result
// slot, restores sp, b0, gp and ar.pfs, and returns via the saved b0.
//
// Frame layout relative to the new sp (derived from the adds below):
//   sp + 16 : FR_X       (GR_Parameter_X)
//   sp + 32 : FR_Y       (GR_Parameter_Y)
//   sp + 48 : FR_RESULT  (GR_Parameter_RESULT)
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y = -32, sp // Parameter 2 address: incoming sp - 32
|
||||
nop.f 0
|
||||
.save ar.pfs, GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp = -64, sp // Create new 64-byte stack frame
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP = gp // Save gp
|
||||
}
|
||||
;;
|
||||
|
||||
// sp has moved down by 64, so GR_Parameter_Y now equals sp + 32.
// The store below uses the base-update form: after writing FR_Y the
// address register is advanced by 16, i.e. to sp + 48.
{ .mmi
|
||||
stfe [ GR_Parameter_Y ] = FR_Y, 16 // Save Parameter 2 on stack
|
||||
add GR_Parameter_X = 16, sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0 = b0 // Save b0
|
||||
}
|
||||
;;
|
||||
|
||||
.body
|
||||
{ .mib
|
||||
stfe [ GR_Parameter_X ] = FR_X // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address (sp + 48)
|
||||
nop.b 0
|
||||
}
|
||||
// FR_RESULT is stored through GR_Parameter_Y (still sp + 48 here); the add
// in the same bundle then moves GR_Parameter_Y back to the FR_Y slot
// (sp + 32) for the call.
{ .mib
|
||||
stfe [ GR_Parameter_Y ] = FR_RESULT // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16, GR_Parameter_Y
|
||||
br.call.sptk b0 = __libm_error_support# // Call error handling function
|
||||
}
|
||||
;;
|
||||
|
||||
// GR_Parameter_RESULT is recomputed from sp rather than reused after the
// call, then used to reload the (possibly updated) result into f8.
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48, sp
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mmi
|
||||
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64, sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
}
|
||||
;;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
.type __libm_error_support#, @function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,438 +0,0 @@
|
||||
.file "hypot.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// History:
|
||||
// 02/02/00 hand-optimized
|
||||
// 04/04/00 Unwind support added
|
||||
// 06/20/00 new version
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
// 04/17/03 Added missing mutex directive
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Function: hypot(x,y) = sqrt(x^2 + y^2) for double precision values
|
||||
// x and y
|
||||
// Also provides cabs functionality.
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Resources Used:
|
||||
//
|
||||
// Floating-Point Registers: f8 (Input and Return Value)
|
||||
// f9 (Input)
|
||||
// f6 -f15, f32-f34
|
||||
//
|
||||
// General Purpose Registers:
|
||||
// r2,r3,r29 (Scratch)
|
||||
// r32-r36 (Locals)
|
||||
// r37-r40 (Used to pass arguments to error handling routine)
|
||||
//
|
||||
// Predicate Registers: p6 - p10
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// IEEE Special Conditions:
|
||||
//
|
||||
// All faults and exceptions should be raised correctly.
|
||||
// Overflow can occur.
|
||||
// hypot(Infinity and anything) = +Infinity
|
||||
// hypot(QNaN and anything) = QNaN
|
||||
// hypot(SNaN and anything ) = QNaN
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Implementation:
|
||||
// x2 = x * x in double-extended
|
||||
// y2 = y * y in double-extended
|
||||
// temp = x2 + y2 in double-extended
|
||||
// sqrt(temp) rounded to double
|
||||
//
|
||||
//*********************************************************************
|
||||
|
||||
GR_SAVE_PFS = r33
|
||||
GR_SAVE_B0 = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_Parameter_X = r36
|
||||
GR_Parameter_Y = r37
|
||||
GR_Parameter_RESULT = r38
|
||||
GR_Parameter_TAG = r39
|
||||
|
||||
FR_X = f32
|
||||
FR_Y = f33
|
||||
FR_RESULT = f8
|
||||
|
||||
.section .text
|
||||
|
||||
LOCAL_LIBM_ENTRY(cabs)
|
||||
LOCAL_LIBM_END(cabs)
|
||||
|
||||
GLOBAL_IEEE754_ENTRY(hypot)
|
||||
|
||||
{.mfi
|
||||
alloc r32= ar.pfs,0,4,4,0
|
||||
// Compute x*x
|
||||
fma.s1 f10=f8,f8,f0
|
||||
// r2=bias-1
|
||||
mov r2=0xfffe
|
||||
}
|
||||
{.mfi
|
||||
// 63/8
|
||||
mov r3=0x40fc //0000
|
||||
// y*y
|
||||
fma.s1 f11=f9,f9,f0
|
||||
// r29=429/16
|
||||
mov r29=0x41d68;; //000
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Check if x is an Inf - if so return Inf even
|
||||
// if y is a NaN (C9X)
|
||||
fclass.m.unc p7, p6 = f8, 0x023
|
||||
shl r3=r3,16
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if possible overflow, copy f8 to f32
|
||||
// set Denormal, if necessary
|
||||
// (p8)
|
||||
fma.d.s0 f32=f8,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Check if y is an Inf - if so return Inf even
|
||||
// if x is a NaN (C9X)
|
||||
fclass.m.unc p8, p9 = f9, 0x023
|
||||
shl r29=r29,12
|
||||
}
|
||||
{ .mfb
|
||||
// f7=0.5
|
||||
setf.exp f7=r2
|
||||
// For x=inf, multiply y by 1 to raise invalid on y an SNaN
|
||||
// (p7) fma.s0 f9=f9,f1,f0
|
||||
// copy f9 to f33; set Denormal, if necessary
|
||||
fma.d.s0 f33=f9,f1,f0
|
||||
nop.b 0;;
|
||||
}
|
||||
{.mfb
|
||||
// f13=63/8
|
||||
setf.s f13=r3
|
||||
// is y Zero ?
|
||||
(p6) fclass.m p6,p0=f9,0x7
|
||||
nop.b 0
|
||||
}
|
||||
{.mlx
|
||||
nop.m 0
|
||||
movl r2=0x408c0000;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// f34=429/16
|
||||
setf.s f34=r29
|
||||
// is x Zero ?
|
||||
(p9) fclass.m p9,p0=f8,0x7
|
||||
// 231/16
|
||||
mov r3=0x4167;; //0000
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// a=x2+y2
|
||||
fma.s1 f12=f10,f1,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// y not NaN ?
|
||||
(p9) fclass.m p8,p0=f9,0x3f
|
||||
shl r3=r3,16
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f6=2
|
||||
fma.s1 f6=f1,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x not NaN ?
|
||||
(p6) fclass.m p7,p0=f8,0x3f
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
// f9=35/8
|
||||
setf.s f9=r2
|
||||
nop.f 0
|
||||
// 2*emax-2
|
||||
mov r2=0x107fb;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p7,p8
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if f8=Infinity or f9=Zero, return |f8|
|
||||
(p7) fmerge.s f8=f0,f32
|
||||
(p7) br.ret.spnt b0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if f9=Infinity or f8=Zero, return |f9|
|
||||
(p8) fmerge.s f8=f0,f33
|
||||
(p8) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
// f10 =231/16
|
||||
setf.s f10=r3
|
||||
// z0=frsqrta(a)
|
||||
frsqrta.s1 f8,p6=f12
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Identify Natvals, Infs, NaNs, and Zeros
|
||||
// and return result
|
||||
fclass.m.unc p7, p0 = f12, 0x1E7
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
// get exponent of x^2+y^2
|
||||
getf.exp r3=f12
|
||||
// if special case, set f8
|
||||
(p7) mov f8=f12
|
||||
(p7) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// S0=a*z0
|
||||
(p6) fma.s1 f14=f12,f8,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// H0=0.5*z0
|
||||
(p6) fma.s1 f15=f8,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f6=5/2
|
||||
fma.s1 f6=f7,f1,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f11=3/2
|
||||
fma.s1 f11=f7,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d=0.5-S0*H0
|
||||
(p6) fnma.s1 f7=f14,f15,f7
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P67=231/16+429/16*d
|
||||
(p6) fma.s1 f10=f34,f7,f10
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P45=63/8*d+35/8
|
||||
(p6) fma.s1 f9=f13,f7,f9
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P23=5/2*d+3/2
|
||||
(p6) fma.s1 f11=f6,f7,f11
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d2=d*d
|
||||
(p6) fma.s1 f13=f7,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P47=d2*P67+P45
|
||||
(p6) fma.s1 f10=f10,f13,f9
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P13=d*P23+1
|
||||
(p6) fma.s1 f11=f11,f7,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d3=d2*d
|
||||
(p6) fma.s1 f13=f13,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// T0=d*S0
|
||||
(p6) fma.s1 f15=f7,f14,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// Is x^2 + y^2 well less than the overflow
|
||||
// threshold?
|
||||
(p6) cmp.lt.unc p7, p8 = r3,r2
|
||||
// P=P13+d3*P47
|
||||
(p6) fma.s1 f10=f13,f10,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// S=P*T0+S0
|
||||
fma.d.s0 f8=f10,f15,f14
|
||||
// No overflow in this case
|
||||
(p7) br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fsetc.s2 0x7F,0x42
|
||||
// Possible overflow path, must detect by
|
||||
// Setting widest range exponent with prevailing
|
||||
// rounding mode.
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
// bias+0x400 (bias+EMAX+1)
|
||||
(p8) mov r2=0x103ff
|
||||
// S=P*T0+S0
|
||||
(p8) fma.d.s2 f12=f10,f15,f14
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mfi
|
||||
(p8) setf.exp f11 = r2
|
||||
(p8) fsetc.s2 0x7F,0x40
|
||||
// Restore Original Mode in S2
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mib
|
||||
nop.m 0
|
||||
mov GR_Parameter_TAG = 46
|
||||
// No overflow
|
||||
(p9) br.ret.sptk b0;;
|
||||
}
|
||||
GLOBAL_IEEE754_END(hypot)
|
||||
libm_alias_double_other (__hypot, hypot)
|
||||
|
||||
// Error path reached from the function above with GR_Parameter_TAG
// already loaded.  It builds a 64-byte frame, stores the two inputs and the
// computed result on the stack, calls __libm_error_support, reloads the
// result from the stack into f8 and returns to the original caller.
// Frame layout relative to the new sp:
//   sp+16 = Parameter 1 (FR_X), sp+32 = Parameter 2 (FR_Y),
//   sp+48 = Parameter 3 (FR_RESULT).
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 address (incoming sp-32, i.e. new sp+32)
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new 64-byte stack frame
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, then advance the pointer by 16
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp // Recompute the result address from sp after the call
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
LOCAL_LIBM_END(__libm_error_region#)
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,394 +0,0 @@
|
||||
.file "hypotf.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// History:
|
||||
// 02/02/00 hand-optimized
|
||||
// 04/04/00 Unwind support added
|
||||
// 06/26/00 new version
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
// 04/17/03 Added missing mutex directive
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Function: hypotf(x,y) = sqrt(x^2 + y^2) for single precision values
|
||||
// x and y
|
||||
// Also provides cabsf functionality.
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Resources Used:
|
||||
//
|
||||
// Floating-Point Registers: f8 (Input and Return Value)
|
||||
// f9 (Input)
|
||||
// f6 -f15
|
||||
//
|
||||
// General Purpose Registers:
|
||||
// r2-r3 (Scratch)
|
||||
// r32-r35 (Locals)
|
||||
// r36-r39 (Used to pass arguments to error handling routine)
|
||||
//
|
||||
// Predicate Registers: p6 - p10
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// IEEE Special Conditions:
|
||||
//
|
||||
// All faults and exceptions should be raised correctly.
|
||||
// Overflow can occur.
|
||||
// hypotf(Infinity and anything) = +Infinity
|
||||
// hypotf(QNaN and anything) = QNaN
|
||||
// hypotf(SNaN and anything ) = QNaN
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Implementation:
|
||||
// x2 = x * x in double-extended
|
||||
// y2 = y * y in double-extended
|
||||
// temp = x2 + y2 in double-extended
|
||||
// sqrt(temp) rounded to single precision
|
||||
//
|
||||
//*********************************************************************
|
||||
|
||||
GR_SAVE_PFS = r33
|
||||
GR_SAVE_B0 = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_Parameter_X = r36
|
||||
GR_Parameter_Y = r37
|
||||
GR_Parameter_RESULT = r38
|
||||
GR_Parameter_TAG = r39
|
||||
|
||||
FR_X = f14
|
||||
FR_Y = f15
|
||||
FR_RESULT = f8
|
||||
|
||||
.section .text
|
||||
|
||||
// cabsf / hypotf entry points.
// cabsf is declared as an empty local entry directly in front of hypotf.
// hypotf: f8 = x and f9 = y on entry; the result is returned in f8.
//   1. Forms a = x*x + y*y in status field s1 and keeps single-precision
//      copies of the inputs in f14/f15 for the early-return paths.
//   2. Returns |x| or |y| for the Inf/zero operand cases selected by the
//      fclass chain, and returns a itself when the final fclass on a
//      (NaTVal/Inf/NaN/zero) matches.
//   3. Otherwise refines z0 = frsqrta(a) with a polynomial in
//      d = 0.5 - S0*H0 and rounds S = P*S0 to single precision in s0.
//   4. When the exponent of a is not below the 2*emax-2 threshold, S is
//      recomputed in s2 with the widest-range exponent and compared with
//      2^(EMAX+1).  If it is not smaller, no return branch is taken and
//      execution continues past the end of this function with
//      GR_Parameter_TAG = 47.
LOCAL_LIBM_ENTRY(cabsf)
|
||||
LOCAL_LIBM_END(cabsf)
|
||||
|
||||
GLOBAL_IEEE754_ENTRY(hypotf)
|
||||
{.mfi
|
||||
alloc r32= ar.pfs,0,4,4,0
|
||||
// Compute x*x
|
||||
fma.s1 f10=f8,f8,f0
|
||||
// r2=bias-1
|
||||
mov r2=0xfffe
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// y*y
|
||||
fma.s1 f11=f9,f9,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Check if x is an Inf - if so return Inf even
|
||||
// if y is a NaN (C9X)
|
||||
fclass.m.unc p7, p6 = f8, 0x023
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if possible overflow, copy f8 to f14
|
||||
// set Denormal, if necessary
|
||||
// (p8)
|
||||
fma.s.s0 f14=f8,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Check if y is an Inf - if so return Inf even
|
||||
// if x is a NaN (C9X)
|
||||
fclass.m.unc p8, p9 = f9, 0x023
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// For x=inf, multiply y by 1 to raise invalid on y an SNaN
|
||||
// (p7) fma.s0 f9=f9,f1,f0
|
||||
// copy f9 to f15; set Denormal, if necessary
|
||||
fma.s.s0 f15=f9,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// is y Zero ?
|
||||
(p6) fclass.m p6,p0=f9,0x7
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// is x Zero ?
|
||||
(p9) fclass.m p9,p0=f8,0x7
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// f7=0.5
|
||||
setf.exp f7=r2
|
||||
// a=x2+y2
|
||||
fma.s1 f12=f10,f1,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x not NaN ?
|
||||
(p6) fclass.m p7,p0=f8,0x3f
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// 2*emax-2
|
||||
mov r2=0x100fb
|
||||
// f6=2
|
||||
fma.s1 f6=f1,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// y not NaN ?
|
||||
(p9) fclass.m p8,p0=f9,0x3f
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p7,p8
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if f8=Infinity or f9=Zero, return |f8|
|
||||
(p7) fmerge.s f8=f0,f14
|
||||
(p7) br.ret.spnt b0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if f9=Infinity or f8=Zero, return |f9|
|
||||
(p8) fmerge.s f8=f0,f15
|
||||
(p8) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Identify Natvals, Infs, NaNs, and Zeros
|
||||
// and return result
|
||||
fclass.m.unc p7, p0 = f12, 0x1E7
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// z0=frsqrta(a)
|
||||
frsqrta.s1 f8,p6=f12
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
// get exponent of x^2+y^2
|
||||
getf.exp r3=f12
|
||||
// if special case, set f8
|
||||
(p7) mov f8=f12
|
||||
(p7) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// S0=a*z0
|
||||
(p6) fma.s1 f12=f12,f8,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// H0=0.5*z0
|
||||
(p6) fma.s1 f10=f8,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f6=5/2
|
||||
fma.s1 f6=f7,f1,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f11=3/2
|
||||
fma.s1 f11=f7,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d=0.5-S0*H0
|
||||
(p6) fnma.s1 f7=f12,f10,f7
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P01=d+1
|
||||
(p6) fma.s1 f10=f1,f7,f1
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P23=5/2*d+3/2
|
||||
(p6) fma.s1 f11=f6,f7,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d2=d*d
|
||||
(p6) fma.s1 f7=f7,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
// Is x^2 + y^2 well less than the overflow
|
||||
// threshold?
|
||||
(p6) cmp.lt.unc p7, p8 = r3,r2
|
||||
// P=P01+d2*P23
|
||||
(p6) fma.s1 f10=f7,f11,f10
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// S=P*S0
|
||||
fma.s.s0 f8=f10,f12,f0
|
||||
// No overflow in this case
|
||||
(p7) br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fsetc.s2 0x7F,0x42
|
||||
// Possible overflow path, must detect by
|
||||
// Setting widest range exponent with prevailing
|
||||
// rounding mode.
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
// bias+0x80 (bias+EMAX+1 for single precision: 0xffff+0x80 = 0x1007f)
|
||||
(p8) mov r2=0x1007f
|
||||
// S=P*S0
|
||||
(p8) fma.s.s2 f12=f10,f12,f0
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mfi
|
||||
(p8) setf.exp f11 = r2
|
||||
(p8) fsetc.s2 0x7F,0x40
|
||||
// Restore Original Mode in S2
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 // p9: wide-range S < 2^(EMAX+1), i.e. no overflow
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mib
|
||||
nop.m 0
|
||||
mov GR_Parameter_TAG = 47 // Tag value for the error path that follows
|
||||
// No overflow
|
||||
(p9) br.ret.sptk b0;;
|
||||
}
|
||||
GLOBAL_IEEE754_END(hypotf)
|
||||
libm_alias_float_other (__hypot, hypot)
|
||||
|
||||
// Error path for hypotf.  It builds a 64-byte frame, stores the two inputs
// and the computed result on the stack as single-precision values, calls
// __libm_error_support with GR_Parameter_TAG = 47, reloads the result from
// the stack into f8 and returns to the original caller.
// Frame layout relative to the new sp:
//   sp+16 = Parameter 1 (FR_X), sp+32 = Parameter 2 (FR_Y),
//   sp+48 = Parameter 3 (FR_RESULT).
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mii
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 address (incoming sp-32, i.e. new sp+32)
|
||||
mov GR_Parameter_TAG = 47 // Error tag handed to __libm_error_support
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new 64-byte stack frame
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack, then advance the pointer by 16
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp // Recompute the result address from sp after the call
|
||||
};;
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,475 +0,0 @@
|
||||
.file "hypotl.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// History:
|
||||
// 02/02/00 hand-optimized
|
||||
// 04/04/00 Unwind support added
|
||||
// 06/20/00 new version
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Function: hypotl(x,y) = sqrt(x^2 + y^2) for double extended values
|
||||
// x and y
|
||||
// Also provides cabsl functionality.
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Resources Used:
|
||||
//
|
||||
// Floating-Point Registers: f8 (Input and Return Value)
|
||||
// f9 (Input)
|
||||
// f6 -f15, f32-f34
|
||||
//
|
||||
// General Purpose Registers:
|
||||
// r2-r3 (Scratch)
|
||||
// r32-r35 (Locals)
|
||||
// r36-r39 (Used to pass arguments to error handling routine)
|
||||
//
|
||||
// Predicate Registers: p6 - p10
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// IEEE Special Conditions:
|
||||
//
|
||||
// All faults and exceptions should be raised correctly.
|
||||
// Overflow can occur.
|
||||
// hypotl(Infinity and anything) = +Infinity
|
||||
// hypotl(QNaN and anything) = QNaN
|
||||
// hypotl(SNaN and anything ) = QNaN
|
||||
//
|
||||
//*********************************************************************
|
||||
//
|
||||
// Implementation:
|
||||
// x2 = x * x in double-extended
|
||||
// y2 = y * y in double-extended
|
||||
// temp = x2 + y2 in double-extended
|
||||
// sqrt(temp) rounded to double extended
|
||||
//
|
||||
//*********************************************************************
|
||||
|
||||
GR_SAVE_PFS = r33
|
||||
GR_SAVE_B0 = r34
|
||||
GR_SAVE_GP = r35
|
||||
GR_Parameter_X = r36
|
||||
GR_Parameter_Y = r37
|
||||
GR_Parameter_RESULT = r38
|
||||
GR_Parameter_TAG = r39
|
||||
|
||||
FR_X = f32
|
||||
FR_Y = f33
|
||||
FR_RESULT = f8
|
||||
|
||||
.section .text
|
||||
|
||||
// cabsl / hypotl entry points.
// cabsl is declared as an empty local entry directly in front of hypotl.
// hypotl: f8 = x and f9 = y on entry; the result is returned in f8.
//   1. Forms x2 = x*x, y2 = y*y and a = x2 + y2 in status field s1, plus
//      the correction terms dx = x*x - x2, dy = y*y - y2 and
//      da = min(x2,y2) - (a - max(x2,y2)); these are summed into c.
//      Copies of the inputs are kept in f32/f33 for the early-return paths.
//   2. Returns |x| or |y| for the Inf/zero operand cases selected by the
//      fclass chain, and returns a itself when the final fclass on a
//      (NaTVal/Inf/NaN/zero) matches.
//   3. Otherwise refines z0 = frsqrta(a) into S1/H1 and produces
//      S = e*H1 + S1 with e = (a - S1*S1) + c, rounded in s0.
//   4. When the exponent of a is not below the 2*emax-2 threshold, S is
//      recomputed in s2 with the widest-range exponent and compared with
//      2^(EMAX+1).  If it is not smaller, no return branch is taken and
//      execution continues past the end of this function with
//      GR_Parameter_TAG = 45.
LOCAL_LIBM_ENTRY(cabsl)
|
||||
LOCAL_LIBM_END(cabsl)
|
||||
|
||||
GLOBAL_IEEE754_ENTRY(hypotl)
|
||||
{.mfi
|
||||
alloc r32= ar.pfs,0,4,4,0
|
||||
// Compute x*x
|
||||
fma.s1 f10=f8,f8,f0
|
||||
// r2=bias-1
|
||||
mov r2=0xfffe
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// y*y
|
||||
fma.s1 f11=f9,f9,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Check if x is an Inf - if so return Inf even
|
||||
// if y is a NaN (C9X)
|
||||
fclass.m.unc p7, p6 = f8, 0x023
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// if possible overflow, copy f8 to f32
|
||||
// set Denormal, if necessary
|
||||
// (p8)
|
||||
fma.s0 f32=f8,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Check if y is an Inf - if so return Inf even
|
||||
// if x is a NaN (C9X)
|
||||
fclass.m.unc p8, p9 = f9, 0x023
|
||||
nop.i 0
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 999
|
||||
// For x=inf, multiply y by 1 to raise invalid on y an SNaN
|
||||
// (p7) fma.s0 f9=f9,f1,f0
|
||||
// copy f9 to f33; set Denormal, if necessary
|
||||
fma.s0 f33=f9,f1,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// is y Zero ?
|
||||
(p6) fclass.m p6,p0=f9,0x7
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// f7=0.5
|
||||
setf.exp f7=r2
|
||||
// a=x2+y2
|
||||
fma.s1 f12=f10,f1,f11
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
mov r2=0x408c // high 16 bits of 0x408c0000 (35/8 as a single); shifted left by 16 below
|
||||
// dx=x*x-x2
|
||||
fms.s1 f13=f8,f8,f10
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// is x Zero ?
|
||||
(p9) fclass.m p9,p0=f8,0x7
|
||||
shl r2=r2,16
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// dy=y*y-y2
|
||||
fms.s1 f14=f9,f9,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x not NaN ?
|
||||
(p6) fclass.m p7,p0=f8,0x3f
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f6=2
|
||||
fma.s1 f6=f1,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f34=min(x2,y2)
|
||||
famin.s1 f34=f10,f11
|
||||
nop.i 0
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// f10=max(x2,y2)
|
||||
famax.s1 f10=f11,f10
|
||||
nop.b 0;; //
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// y not NaN ?
|
||||
(p9) fclass.m p8,p0=f9,0x3f
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
// f9=35/8
|
||||
setf.s f9=r2
|
||||
// if f8=Infinity or f9=Zero, return |f8|
|
||||
(p7) fmerge.s f8=f0,f32
|
||||
(p7) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// z0=frsqrta(a)
|
||||
frsqrta.s1 f8,p6=f12
|
||||
nop.i 0;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
// Identify Natvals, Infs, NaNs, and Zeros
|
||||
// and return result
|
||||
fclass.m.unc p7, p0 = f12, 0x1E7
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// get exponent of x^2+y^2
|
||||
getf.exp r3=f12
|
||||
// dxy=dx+dy
|
||||
fma.s1 f13=f13,f1,f14
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
// 2*emax-2
|
||||
mov r2=0x17ffb
|
||||
// if f9=Infinity or f8=Zero, return |f9|
|
||||
(p8) fmerge.s f8=f0,f33
|
||||
(p8) br.ret.spnt b0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// dd=a-max(x2,y2)
|
||||
fnma.s1 f10=f10,f1,f12
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// S0=a*z0
|
||||
(p6) fma.s1 f14=f12,f8,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// H0=0.5*z0
|
||||
(p6) fma.s1 f15=f8,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// if special case, set f8
|
||||
(p7) mov f8=f12
|
||||
(p7) br.ret.spnt b0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// da=min(x2,y2)-dd
|
||||
fnma.s1 f10=f10,f1,f34
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f6=5/2
|
||||
fma.s1 f6=f7,f1,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// f11=3/2
|
||||
fma.s1 f11=f7,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d=0.5-S0*H0
|
||||
(p6) fnma.s1 f7=f14,f15,f7
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P1=3/2*d+1
|
||||
(p6) fma.s1 f11=f11,f7,f1
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P2=35/8*d+5/2
|
||||
(p6) fma.s1 f9=f9,f7,f6
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// d2=d*d
|
||||
(p6) fma.s1 f34=f7,f7,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// T0=d*S0
|
||||
(p6) fma.s1 f6=f7,f14,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// G0=d*H0
|
||||
(p6) fma.s1 f7=f7,f15,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P=d2*P2+P1
|
||||
(p6) fma.s1 f11=f34,f9,f11
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// S1=p*T0+S0
|
||||
(p6) fma.s1 f14=f11,f6,f14
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// H1=p*G0+H0
|
||||
(p6) fma.s1 f15=f11,f7,f15
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// e1=a-S1*S1
|
||||
(p6) fnma.s1 f7=f14,f14,f12
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// Is x^2 + y^2 well less than the overflow
|
||||
// threshold?
|
||||
(p6) cmp.lt.unc p7, p8 = r3,r2
|
||||
// c=dxy+da
|
||||
(p6) fma.s1 f13=f13,f1,f10
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// e=e1+c
|
||||
(p6) fma.s1 f13=f7,f1,f13
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// S=e*H1+S1
|
||||
fma.s0 f8=f13,f15,f14
|
||||
// No overflow in this case
|
||||
(p7) br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fsetc.s2 0x7F,0x42
|
||||
// Possible overflow path, must detect by
|
||||
// Setting widest range exponent with prevailing
|
||||
// rounding mode.
|
||||
nop.i 0 ;;
|
||||
}
|
||||
|
||||
|
||||
{ .mfi
|
||||
// bias+0x4000 (bias+EMAX+1)
|
||||
(p8) mov r2=0x13fff
|
||||
// S=e*H1+S1
|
||||
(p8) fma.s2 f12=f13,f15,f14
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mfi
|
||||
(p8) setf.exp f11 = r2
|
||||
(p8) fsetc.s2 0x7F,0x40
|
||||
// Restore Original Mode in S2
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 // p9: wide-range S < 2^(EMAX+1), i.e. no overflow
|
||||
nop.i 0 ;;
|
||||
}
|
||||
{ .mib
|
||||
nop.m 0
|
||||
mov GR_Parameter_TAG = 45; // Tag value for the error path that follows
|
||||
// No overflow
|
||||
(p9) br.ret.sptk b0;;
|
||||
}
|
||||
GLOBAL_IEEE754_END(hypotl)
|
||||
libm_alias_ldouble_other (__hypot, hypot)
|
||||
|
||||
// Error path for hypotl, reached with GR_Parameter_TAG already loaded.
// It builds a 64-byte frame, stores the two inputs and the computed result
// on the stack in extended format, calls __libm_error_support, reloads the
// result from the stack into f8 and returns to the original caller.
// Frame layout relative to the new sp:
//   sp+16 = Parameter 1 (FR_X), sp+32 = Parameter 2 (FR_Y),
//   sp+48 = Parameter 3 (FR_RESULT).
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 address (incoming sp-32, i.e. new sp+32)
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new 64-byte stack frame
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, then advance the pointer by 16
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
add GR_Parameter_RESULT = 48,sp // Recompute the result address from sp after the call
|
||||
};;
|
||||
{ .mmi
|
||||
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
LOCAL_LIBM_END(__libm_error_region#)
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,267 +0,0 @@
|
||||
.file "ilogbl.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/03/00 Initial version
|
||||
// 05/26/00 Fix bug when x a double-extended denormal;
|
||||
// if x=0 call error routine, per C9X
|
||||
// 08/15/00 Bundle added after call to __libm_error_support to properly
|
||||
// set [the previously overwritten] GR_Parameter_RESULT.
|
||||
// 01/20/01 Fixed result for x=0
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 01/20/03 Improved performance
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// int ilogbl( long double x );
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// The ilogbl function extracts the exponent of x as an integer
|
||||
// and returns it in r8
|
||||
//
|
||||
// ilogbl is similar to logbl but differs in the following ways:
|
||||
// +-inf
|
||||
// ilogbl: returns INT_MAX
|
||||
// logbl: returns +inf
|
||||
// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
|
||||
// ilogbl: returns INT_MAX (7fffffff)
|
||||
// logbl: returns QNAN (quietized SNAN)
|
||||
// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
|
||||
// ilogbl: returns -INT_MAX (80000001)
|
||||
// logbl: returns -inf, raises the divide-by-zero exception,
|
||||
// and calls libm_error_support to set domain error
|
||||
//
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// general registers used:
|
||||
// r26 -> r39
|
||||
// r36 -> r39 used as parameters to error path
|
||||
//
|
||||
// predicate registers used:
|
||||
// p6 -> p10
|
||||
// floating-point registers used:
|
||||
// f9, f10, f11
|
||||
// f8, input
|
||||
|
||||
rExpBias = r26
|
||||
rExpMask = r27
|
||||
rSignexp_x = r28
|
||||
rExp_x = r29
|
||||
rIntMax = r30
|
||||
rExp_2to64 = r31
|
||||
|
||||
GR_SAVE_PFS = r32
|
||||
rTrialResult = r33
|
||||
GR_SAVE_B0 = r34
|
||||
GR_SAVE_GP = r35
|
||||
|
||||
GR_Parameter_X = r36
|
||||
GR_Parameter_Y = r37
|
||||
GR_Parameter_RESULT = r38
|
||||
GR_Parameter_TAG = r39
|
||||
|
||||
fTmp = f9
|
||||
fNorm_x = f10
|
||||
f2to64 = f11
|
||||
|
||||
.section .text
|
||||
GLOBAL_LIBM_ENTRY(__ieee754_ilogbl)
|
||||
|
||||
// X NORMAL
|
||||
// TrueExp_x = exp(f8) - 0xffff
|
||||
// r8 = TrueExp_x
|
||||
{ .mfi
|
||||
getf.exp rSignexp_x = f8
|
||||
fclass.m p8,p0 = f8, 0x0b // Test for x unorm
|
||||
mov rExpBias = 0xffff // Exponent bias
|
||||
}
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fnorm.s1 fNorm_x = f8
|
||||
mov rExpMask = 0x1ffff // Exponent mask
|
||||
}
|
||||
;;
|
||||
|
||||
// Form signexp of 2^64 in case need to scale denormal
|
||||
{ .mfb
|
||||
mov rExp_2to64 = 0x1003f
|
||||
fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf
|
||||
(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm
|
||||
}
|
||||
;;
|
||||
|
||||
ILOGB_COMMON:
|
||||
// Return here from ILOGB_DENORM
|
||||
{ .mfi
|
||||
and rExp_x = rSignexp_x, rExpMask // Get biased exponent
|
||||
fclass.m p7,p10 = f8, 0x07 // Test x zero
|
||||
nop.i 0
|
||||
}
|
||||
{ .mlx
|
||||
nop.m 0
|
||||
movl rIntMax = 0x000000007fffffff // Form INT_MAX
|
||||
}
|
||||
;;
|
||||
|
||||
.pred.rel "mutex",p6,p9
|
||||
{ .mfi
|
||||
(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path
|
||||
(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag
|
||||
(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX
|
||||
}
|
||||
{ .mbb
|
||||
nop.m 0
|
||||
(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero
|
||||
(p10) br.ret.sptk b0 // Exit if x not zero
|
||||
}
|
||||
;;
|
||||
|
||||
|
||||
ILOGB_DENORM:
|
||||
// Form 2^64 in case need to scale denormal
|
||||
// Check to see if double-extended denormal
|
||||
{ .mfi
|
||||
setf.exp f2to64 = rExp_2to64
|
||||
fclass.m p8,p0 = fNorm_x, 0x0b
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
{ .mfi
|
||||
nop.m 0
|
||||
fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// If double-extended denormal add 64 to exponent bias for scaling
|
||||
// If double-extended denormal form x * 2^64 which is normal
|
||||
{ .mfi
|
||||
(p8) add rExpBias = 64, rExpBias
|
||||
(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
|
||||
// Logic is the same as normal path but use normalized input
|
||||
{ .mib
|
||||
getf.exp rSignexp_x = fNorm_x
|
||||
nop.i 0
|
||||
br.cond.sptk ILOGB_COMMON // Return to main path
|
||||
}
|
||||
;;
|
||||
|
||||
ILOGB_ZERO:
|
||||
// Here if x zero
|
||||
// Return INT_MIN, call error support
|
||||
|
||||
{ .mlx
|
||||
alloc r32=ar.pfs,1,3,4,0
|
||||
movl rTrialResult = 0x0000000080000000
|
||||
}
|
||||
{ .mib
|
||||
mov GR_Parameter_TAG = 156 // Error code
|
||||
nop.i 0
|
||||
br.cond.sptk __libm_error_region // Call error support
|
||||
}
|
||||
;;
|
||||
|
||||
GLOBAL_LIBM_END(__ieee754_ilogbl)
|
||||
|
||||
|
||||
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
|
||||
{ .mfi
|
||||
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
|
||||
.body
|
||||
{ .mib
|
||||
stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfe [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
add GR_Parameter_RESULT = 48,sp
|
||||
nop.m 0
|
||||
nop.i 0
|
||||
};;
|
||||
|
||||
{ .mmi
|
||||
mov r8 = rTrialResult
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
|
||||
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1,70 +0,0 @@
|
||||
/* file: lgamma_r.c */
|
||||
|
||||
|
||||
// Copyright (c) 2002 Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
//
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/04/02: Initial version
|
||||
// 02/22/02: Removed lgammaf_r, gammaf_r
|
||||
/*
|
||||
// FUNCTIONS: double lgamma_r(double x, int* signgam)
|
||||
// double gamma_r(double x, int* signgam)
|
||||
// Natural logarithm of GAMMA function
|
||||
*/
|
||||
|
||||
#include "libm_support.h"
|
||||
|
||||
|
||||
extern double __libm_lgamma(double /*x*/, int* /*signgam*/, int /*signgamsz*/);
|
||||
|
||||
|
||||
double __ieee754_lgamma_r(double x, int* signgam)
|
||||
{
|
||||
return __libm_lgamma(x, signgam, sizeof(*signgam));
|
||||
}
|
||||
libm_alias_double_r (__ieee754_lgamma, lgamma, _r)
|
||||
|
||||
#ifndef _LIBC
|
||||
double __ieee754_gamma_r(double x, int* signgam)
|
||||
{
|
||||
return __libm_lgamma(x, signgam, sizeof(*signgam));
|
||||
}
|
||||
weak_alias (__ieee754_gamma_r, gamma_r)
|
||||
#endif
|
@ -1,70 +0,0 @@
|
||||
/* file: lgammaf_r.c */
|
||||
|
||||
|
||||
// Copyright (c) 2002 Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
//
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 02/04/02: Initial version
|
||||
// 02/22/02: Removed lgamma_r, gamma_r
|
||||
/*
|
||||
// FUNCTIONS: float lgammaf_r(float x, int* signgam)
|
||||
// float gammaf_r(float x, int* signgam)
|
||||
// Natural logarithm of GAMMA function
|
||||
*/
|
||||
|
||||
#include "libm_support.h"
|
||||
|
||||
|
||||
extern float __libm_lgammaf(float /*x*/, int* /*signgam*/, int /*signgamsz*/);
|
||||
|
||||
|
||||
float __ieee754_lgammaf_r(float x, int* signgam)
|
||||
{
|
||||
return __libm_lgammaf(x, signgam, sizeof(*signgam));
|
||||
}
|
||||
libm_alias_float_r (__ieee754_lgamma, lgamma, _r)
|
||||
|
||||
#ifndef _LIBC
|
||||
float __ieee754_gammaf_r(float x, int* signgam)
|
||||
{
|
||||
return __libm_lgammaf(x, signgam, sizeof(*signgam));
|
||||
}
|
||||
weak_alias (__ieee754_gammaf_r, gammaf_r)
|
||||
#endif
|
@ -1,69 +0,0 @@
|
||||
/* file: lgammal_r.c */
|
||||
|
||||
|
||||
// Copyright (c) 2002 Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
//
|
||||
|
||||
// History
|
||||
//==============================================================
|
||||
// 08/15/02: Initial version
|
||||
/*
|
||||
// FUNCTIONS: long double lgammal_r(long double x, int* signgam)
|
||||
// long double gammal_r(long double x, int* signgam)
|
||||
// Natural logarithm of GAMMA function
|
||||
*/
|
||||
|
||||
#include "libm_support.h"
|
||||
|
||||
|
||||
/* Core long double log-gamma routine, defined outside this file.  The
   wrappers below pass sizeof (*signgam) as signgamsz and return its result
   unchanged as long double, so it is declared with a long double return
   type (the float and double variants follow the same pattern); declaring
   it as double would let the compiler treat the result as double precision.  */
extern long double __libm_lgammal(long double /*x*/, int* /*signgam*/, int /*signgamsz*/);
|
||||
|
||||
|
||||
long double __ieee754_lgammal_r(long double x, int* signgam)
|
||||
{
|
||||
return __libm_lgammal(x, signgam, sizeof(*signgam));
|
||||
}
|
||||
libm_alias_ldouble_r (__ieee754_lgamma, lgamma, _r)
|
||||
|
||||
#ifndef _LIBC
|
||||
long double __ieee754_gammal_r(long double x, int* signgam)
|
||||
{
|
||||
return __libm_lgammal(x, signgam, sizeof(*signgam));
|
||||
}
|
||||
weak_alias (__ieee754_gammal_r, gammal_r)
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,715 +0,0 @@
|
||||
.file "log2.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//=================================================================
|
||||
// 09/11/00 Initial version
|
||||
// 03/19/01 Added one polynomial coefficient, to improve accuracy
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
// 04/18/03 Reformatted T[255]
|
||||
//
|
||||
// API
|
||||
//=================================================================
|
||||
// double log2(double)
|
||||
//
|
||||
// Overview of operation
|
||||
//=================================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
|
||||
// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8 (table index)
|
||||
// j=0 if f<128; j=1 if f>=128
|
||||
// T is a table that stores log2(1/y) (in entries 1..255) rounded to
|
||||
// double extended precision; f is used as an index; T[255]=0
|
||||
//
|
||||
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
|
||||
// and 0 is used instead of T[0]
|
||||
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
|
||||
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
|
||||
// for m=2(1-r'), 0<=r'<2^{-9})
|
||||
//
|
||||
// log2(x) is approximated as
|
||||
// (l-j) + T[f] + (c1*r+c2*r^2+...+c7*r^7), if f>0
|
||||
//
|
||||
|
||||
|
||||
// Special values
|
||||
//=================================================================
|
||||
// log2(0)=-inf, raises Divide by Zero
|
||||
// log2(+inf)=inf
|
||||
// log2(x)=NaN, raises Invalid if x<0
|
||||
//
|
||||
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// f6-f15, f32-f33
|
||||
// r2-r3, r23-r30
|
||||
// p6,p7,p8,p10,p12
|
||||
//
|
||||
|
||||
|
||||
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35 // This reg. can safely be used
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0xbfd0000000000000, 0x3fc999999999999a //C_4, C_5
|
||||
data8 0xbfc5555555555555, 0x3fc2492492492492 //C_6, C_7
|
||||
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
|
||||
data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // C_3=1/3
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
data8 0xb8d8752172fed131, 0x00003ff6
|
||||
data8 0x8ae7f475764180a3, 0x00003ff8
|
||||
data8 0xe7f73862e72ee35d, 0x00003ff8
|
||||
data8 0xa2b25310c941a2f2, 0x00003ff9
|
||||
data8 0xcbb91d671abb2e85, 0x00003ff9
|
||||
data8 0xfac91e34daa50483, 0x00003ff9
|
||||
data8 0x9504a5042eb495c5, 0x00003ffa
|
||||
data8 0xa9c4a0bbb580ee02, 0x00003ffa
|
||||
data8 0xc19264dc8a5e3bf9, 0x00003ffa
|
||||
data8 0xd67aa6703ebf4a77, 0x00003ffa
|
||||
data8 0xee76cac6d6e08ce7, 0x00003ffa
|
||||
data8 0x81c3f7de5434ed04, 0x00003ffb
|
||||
data8 0x8c563033a3ce01e4, 0x00003ffb
|
||||
data8 0x9876e9f09a98661c, 0x00003ffb
|
||||
data8 0xa31e0ac9b2326ce2, 0x00003ffb
|
||||
data8 0xadcf09e1fd10e4a5, 0x00003ffb
|
||||
data8 0xb889f992cf03cdb6, 0x00003ffb
|
||||
data8 0xc34eec68d901a714, 0x00003ffb
|
||||
data8 0xce1df524e9909ed9, 0x00003ffb
|
||||
data8 0xd8f726bcb0b80ad0, 0x00003ffb
|
||||
data8 0xe3da945b878e27d1, 0x00003ffb
|
||||
data8 0xeec851633b76a320, 0x00003ffb
|
||||
data8 0xf82ea4bb6101421a, 0x00003ffb
|
||||
data8 0x8197ddd7736b2864, 0x00003ffc
|
||||
data8 0x871dad4f994253f0, 0x00003ffc
|
||||
data8 0x8ca8cae3e892d549, 0x00003ffc
|
||||
data8 0x916d6e1559a4b697, 0x00003ffc
|
||||
data8 0x97028118efabeb7d, 0x00003ffc
|
||||
data8 0x9bcfbce1592ad5d5, 0x00003ffc
|
||||
data8 0xa16ee95d0da54a91, 0x00003ffc
|
||||
data8 0xa644dcf3403fa5d0, 0x00003ffc
|
||||
data8 0xab1ee14ffd659064, 0x00003ffc
|
||||
data8 0xb0cd12faebcc6757, 0x00003ffc
|
||||
data8 0xb5affdf9b3b221e0, 0x00003ffc
|
||||
data8 0xba970fb307c6ade1, 0x00003ffc
|
||||
data8 0xbf824f3a9f3e7561, 0x00003ffc
|
||||
data8 0xc544c055fde99333, 0x00003ffc
|
||||
data8 0xca39266532bdf26c, 0x00003ffc
|
||||
data8 0xcf31d124b8fa2f56, 0x00003ffc
|
||||
data8 0xd42ec7f59017b6ab, 0x00003ffc
|
||||
data8 0xd930124bea9a2c67, 0x00003ffc
|
||||
data8 0xde35b7af70e4dab3, 0x00003ffc
|
||||
data8 0xe33fbfbb8533ef03, 0x00003ffc
|
||||
data8 0xe77625911a7dcef3, 0x00003ffc
|
||||
data8 0xec884bd689cc12e3, 0x00003ffc
|
||||
data8 0xf19eeabf9e99a40a, 0x00003ffc
|
||||
data8 0xf6ba0a35e3d88051, 0x00003ffc
|
||||
data8 0xfbd9b237f7b4192b, 0x00003ffc
|
||||
data8 0x80111d4a1ee0c79e, 0x00003ffd
|
||||
data8 0x82a523a5f875bbfc, 0x00003ffd
|
||||
data8 0x84ccecdc92cd0815, 0x00003ffd
|
||||
data8 0x87653369d92c057a, 0x00003ffd
|
||||
data8 0x89ffd1742da3aa21, 0x00003ffd
|
||||
data8 0x8c2d2227d053d9b6, 0x00003ffd
|
||||
data8 0x8e5c189793f7f798, 0x00003ffd
|
||||
data8 0x90fd0a20e72f3c96, 0x00003ffd
|
||||
data8 0x932fa937301e59ae, 0x00003ffd
|
||||
data8 0x95d5061a5f0f5f7f, 0x00003ffd
|
||||
data8 0x980b5a2ef10e7023, 0x00003ffd
|
||||
data8 0x9a4361c5514d3c27, 0x00003ffd
|
||||
data8 0x9c7d1f7d541313fd, 0x00003ffd
|
||||
data8 0x9f2b16040b500d04, 0x00003ffd
|
||||
data8 0xa168a0fa9db22c98, 0x00003ffd
|
||||
data8 0xa3a7eaa1f9116293, 0x00003ffd
|
||||
data8 0xa5e8f5b4072a3d44, 0x00003ffd
|
||||
data8 0xa82bc4f11a5e88aa, 0x00003ffd
|
||||
data8 0xaa705b2001db8317, 0x00003ffd
|
||||
data8 0xacb6bb0e1e0f8005, 0x00003ffd
|
||||
data8 0xaefee78f75707221, 0x00003ffd
|
||||
data8 0xb148e37ec994dd99, 0x00003ffd
|
||||
data8 0xb394b1bdaca0bc17, 0x00003ffd
|
||||
data8 0xb5e255349707e496, 0x00003ffd
|
||||
data8 0xb831d0d2fda791cc, 0x00003ffd
|
||||
data8 0xba83278f6838ab20, 0x00003ffd
|
||||
data8 0xbcd65c67881c7d47, 0x00003ffd
|
||||
data8 0xbeb3e0f21d72dc92, 0x00003ffd
|
||||
data8 0xc10a7a03457d35dc, 0x00003ffd
|
||||
data8 0xc362f9b6f51eddd3, 0x00003ffd
|
||||
data8 0xc5bd6326ebfce656, 0x00003ffd
|
||||
data8 0xc7a0b3d0637c8f97, 0x00003ffd
|
||||
data8 0xc9fe96af0df8e4b5, 0x00003ffd
|
||||
data8 0xcc5e6c214b4a2cd7, 0x00003ffd
|
||||
data8 0xce46199f374d29cf, 0x00003ffd
|
||||
data8 0xd0a978a14c0d9ebe, 0x00003ffd
|
||||
data8 0xd293fecafec7f9b5, 0x00003ffd
|
||||
data8 0xd4faf1f6f5cf32e6, 0x00003ffd
|
||||
data8 0xd6e8595abaad34d1, 0x00003ffd
|
||||
data8 0xd952eb7a8ffc1593, 0x00003ffd
|
||||
data8 0xdb433ccd805f171e, 0x00003ffd
|
||||
data8 0xddb178dc43e6bd84, 0x00003ffd
|
||||
data8 0xdfa4bcfb333342a4, 0x00003ffd
|
||||
data8 0xe19953741ccea015, 0x00003ffd
|
||||
data8 0xe40cee16a2ff21c5, 0x00003ffd
|
||||
data8 0xe6048470cdbde8ea, 0x00003ffd
|
||||
data8 0xe7fd7308d6895b14, 0x00003ffd
|
||||
data8 0xe9f7bbb6a1ff9f87, 0x00003ffd
|
||||
data8 0xec7280138809433d, 0x00003ffd
|
||||
data8 0xee6fda4365cd051f, 0x00003ffd
|
||||
data8 0xf06e94a122ff1f12, 0x00003ffd
|
||||
data8 0xf26eb1151441fce5, 0x00003ffd
|
||||
data8 0xf470318b88a77e2f, 0x00003ffd
|
||||
data8 0xf67317f4d4c8aa58, 0x00003ffd
|
||||
data8 0xf8f8b250a9c4cde6, 0x00003ffd
|
||||
data8 0xfafec54831f1a484, 0x00003ffd
|
||||
data8 0xfd06449bf3eaea1e, 0x00003ffd
|
||||
data8 0xff0f324ddb19ab67, 0x00003ffd
|
||||
data8 0x808cc8320a9acf15, 0x00003ffe
|
||||
data8 0x8192b0748f2cef06, 0x00003ffe
|
||||
data8 0x829952f5e6a24ee5, 0x00003ffe
|
||||
data8 0x83a0b0bfafe1424e, 0x00003ffe
|
||||
data8 0x8466b29f9c41caea, 0x00003ffe
|
||||
data8 0x856f5aae0881d857, 0x00003ffe
|
||||
data8 0x8678c0eae8ee8190, 0x00003ffe
|
||||
data8 0x8782e6685676b9d7, 0x00003ffe
|
||||
data8 0x888dcc3abc4554ec, 0x00003ffe
|
||||
data8 0x89997378de7b98b8, 0x00003ffe
|
||||
data8 0x8aa5dd3be1044279, 0x00003ffe
|
||||
data8 0x8b6facdfd0360ab8, 0x00003ffe
|
||||
data8 0x8c7d6db7169e0cdb, 0x00003ffe
|
||||
data8 0x8d8bf424d6e130b2, 0x00003ffe
|
||||
data8 0x8e575b506f409fa6, 0x00003ffe
|
||||
data8 0x8f673e418776492c, 0x00003ffe
|
||||
data8 0x9077e9ed700ef9ba, 0x00003ffe
|
||||
data8 0x9144ef1baec80b20, 0x00003ffe
|
||||
data8 0x9256fcdb537f035f, 0x00003ffe
|
||||
data8 0x9369d68d75e7e1d6, 0x00003ffe
|
||||
data8 0x943880613b8f9f1e, 0x00003ffe
|
||||
data8 0x954cc1d9e0d94206, 0x00003ffe
|
||||
data8 0xd3c70a37bdf7a294, 0x0000bffd
|
||||
data8 0xd19bb053fb0284ec, 0x0000bffd
|
||||
data8 0xcffa1a3b7dafb8bf, 0x0000bffd
|
||||
data8 0xcdcbe1e2776479ee, 0x0000bffd
|
||||
data8 0xcc282218b8bfdda2, 0x0000bffd
|
||||
data8 0xc9f703a9afcb38ac, 0x0000bffd
|
||||
data8 0xc851146ab89593c6, 0x0000bffd
|
||||
data8 0xc61d08265927a860, 0x0000bffd
|
||||
data8 0xc474e39705912d26, 0x0000bffd
|
||||
data8 0xc23de19ec30c6e3e, 0x0000bffd
|
||||
data8 0xc09381cc45db45b4, 0x0000bffd
|
||||
data8 0xbee82b4e025ff90c, 0x0000bffd
|
||||
data8 0xbcace101149788ec, 0x0000bffd
|
||||
data8 0xbaff46962ea47964, 0x0000bffd
|
||||
data8 0xb950b1be5e0c14a2, 0x0000bffd
|
||||
data8 0xb7110e6ce866f2bc, 0x0000bffd
|
||||
data8 0xb5602ccc2a81db52, 0x0000bffd
|
||||
data8 0xb3ae4ce740fc8ef1, 0x0000bffd
|
||||
data8 0xb1fb6d92c8240ccc, 0x0000bffd
|
||||
data8 0xafb609c09b244abc, 0x0000bffd
|
||||
data8 0xae00d1cfdeb43cfd, 0x0000bffd
|
||||
data8 0xac4a967a8c8c9bd0, 0x0000bffd
|
||||
data8 0xaa93568c249e6c52, 0x0000bffd
|
||||
data8 0xa8db10cdff375343, 0x0000bffd
|
||||
data8 0xa68e6fc5a42376e3, 0x0000bffd
|
||||
data8 0xa4d3c25e68dc57f2, 0x0000bffd
|
||||
data8 0xa3180b0c192a3816, 0x0000bffd
|
||||
data8 0xa15b488e7aa329a0, 0x0000bffd
|
||||
data8 0x9f9d79a30f0e1d5f, 0x0000bffd
|
||||
data8 0x9dde9d050ee7d4ac, 0x0000bffd
|
||||
data8 0x9c1eb16d63d7356c, 0x0000bffd
|
||||
data8 0x9a5db592a310c36a, 0x0000bffd
|
||||
data8 0x989ba82907a9016f, 0x0000bffd
|
||||
data8 0x96d887e26cd57b79, 0x0000bffd
|
||||
data8 0x9514536e481c3a4f, 0x0000bffd
|
||||
data8 0x934f0979a3715fc9, 0x0000bffd
|
||||
data8 0x9188a8af1742a9d5, 0x0000bffd
|
||||
data8 0x8fc12fb6c470995f, 0x0000bffd
|
||||
data8 0x8df89d364e34f8f1, 0x0000bffd
|
||||
data8 0x8c2eefd0d3f67dd6, 0x0000bffd
|
||||
data8 0x8a642626eb093d54, 0x0000bffd
|
||||
data8 0x88983ed6985bae58, 0x0000bffd
|
||||
data8 0x86cb387b4a0feec6, 0x0000bffd
|
||||
data8 0x84fd11add101024b, 0x0000bffd
|
||||
data8 0x83c856dd81804b78, 0x0000bffd
|
||||
data8 0x81f84c2c62afd6f1, 0x0000bffd
|
||||
data8 0x80271d3e4be5ea5a, 0x0000bffd
|
||||
data8 0xfca991447e7b485d, 0x0000bffc
|
||||
data8 0xf90299c904793a3c, 0x0000bffc
|
||||
data8 0xf559511d2dc1ed69, 0x0000bffc
|
||||
data8 0xf2e72afee9bd2aee, 0x0000bffc
|
||||
data8 0xef39ff1d8a40770e, 0x0000bffc
|
||||
data8 0xeb8a7a2311c935dc, 0x0000bffc
|
||||
data8 0xe7d8990dc620012f, 0x0000bffc
|
||||
data8 0xe560b1e3b86e44b6, 0x0000bffc
|
||||
data8 0xe1aadb38caee80c4, 0x0000bffc
|
||||
data8 0xddf2a051f81b76a4, 0x0000bffc
|
||||
data8 0xdb7678bafcaf4b5f, 0x0000bffc
|
||||
data8 0xd7ba3a8f0df19bfc, 0x0000bffc
|
||||
data8 0xd3fb8fdbdd5cebdb, 0x0000bffc
|
||||
data8 0xd17b191905c35652, 0x0000bffc
|
||||
data8 0xcdb85d29cefd7121, 0x0000bffc
|
||||
data8 0xc9f32c3c88221ef6, 0x0000bffc
|
||||
data8 0xc76e5741a95b5dae, 0x0000bffc
|
||||
data8 0xc3a506d80d38c718, 0x0000bffc
|
||||
data8 0xbfd938ccef8b68c1, 0x0000bffc
|
||||
data8 0xbd4ff63e82eef78c, 0x0000bffc
|
||||
data8 0xb97ffa2b563865bd, 0x0000bffc
|
||||
data8 0xb6f3eb3011eddcea, 0x0000bffc
|
||||
data8 0xb31fb7d64898b3e6, 0x0000bffc
|
||||
data8 0xb090d63a409e7880, 0x0000bffc
|
||||
data8 0xacb8623c7ffa4f39, 0x0000bffc
|
||||
data8 0xa8dd5c83d2e45246, 0x0000bffc
|
||||
data8 0xa649e998a8d91f2e, 0x0000bffc
|
||||
data8 0xa26a93fed6faa94f, 0x0000bffc
|
||||
data8 0x9fd43df079d0db1f, 0x0000bffc
|
||||
data8 0x9d3cbe69aecac4c2, 0x0000bffc
|
||||
data8 0x99574f13c570d0fb, 0x0000bffc
|
||||
data8 0x96bce349bf7ee6c7, 0x0000bffc
|
||||
data8 0x92d30c9b86cee18e, 0x0000bffc
|
||||
data8 0x9035adef17c5bd5c, 0x0000bffc
|
||||
data8 0x8c4765e8e8b5f251, 0x0000bffc
|
||||
data8 0x89a70da448316ffa, 0x0000bffc
|
||||
data8 0x85b44a24474af78a, 0x0000bffc
|
||||
data8 0x8310f17aab5adf70, 0x0000bffc
|
||||
data8 0x806c6388d0965f29, 0x0000bffc
|
||||
data8 0xf8e69092bf0c5ead, 0x0000bffb
|
||||
data8 0xf397608bfd2d90e6, 0x0000bffb
|
||||
data8 0xee45be24d0eedbc4, 0x0000bffb
|
||||
data8 0xe646af233db881e9, 0x0000bffb
|
||||
data8 0xe0eee4e1ce3d06fb, 0x0000bffb
|
||||
data8 0xdb94a049e6e87a4f, 0x0000bffb
|
||||
data8 0xd3888ef9a4249f5a, 0x0000bffb
|
||||
data8 0xce280e6fbac39194, 0x0000bffb
|
||||
data8 0xc8c50b72319ad574, 0x0000bffb
|
||||
data8 0xc0abcd39f41e329b, 0x0000bffb
|
||||
data8 0xbb4279cfa7f9667b, 0x0000bffb
|
||||
data8 0xb5d69bac77ec398a, 0x0000bffb
|
||||
data8 0xb068306bf20d6233, 0x0000bffb
|
||||
data8 0xa83dc1b019ddb6a8, 0x0000bffb
|
||||
data8 0xa2c8eb1886c2d024, 0x0000bffb
|
||||
data8 0x9d517ee93f8e16c0, 0x0000bffb
|
||||
data8 0x97d77aae659b92fb, 0x0000bffb
|
||||
data8 0x8f9b91da5736d415, 0x0000bffb
|
||||
data8 0x8a1b06b09b7fd1d1, 0x0000bffb
|
||||
data8 0x8497daca0a2e077a, 0x0000bffb
|
||||
data8 0xfe241745a453f10c, 0x0000bffa
|
||||
data8 0xf3132d6708d723c5, 0x0000bffa
|
||||
data8 0xe7fcf2e21a0e7d77, 0x0000bffa
|
||||
data8 0xd75198b04afb8da9, 0x0000bffa
|
||||
data8 0xcc2dfe1a4a8ca305, 0x0000bffa
|
||||
data8 0xc10500d63aa65882, 0x0000bffa
|
||||
data8 0xb5d69bac77ec398a, 0x0000bffa
|
||||
data8 0xaaa2c95dc66abcde, 0x0000bffa
|
||||
data8 0x9f6984a342d13101, 0x0000bffa
|
||||
data8 0x942ac82e5387ac51, 0x0000bffa
|
||||
data8 0x88e68ea899a0976c, 0x0000bffa
|
||||
data8 0xefebc4409ccf872e, 0x0000bff9
|
||||
data8 0xd947b0c6642ef69e, 0x0000bff9
|
||||
data8 0xc2987d51e043d407, 0x0000bff9
|
||||
data8 0xabde1eeee6bfd257, 0x0000bff9
|
||||
data8 0x95188a9917cf2e01, 0x0000bff9
|
||||
data8 0xfc8f6a777c1b7f1e, 0x0000bff8
|
||||
data8 0xced727635c59725c, 0x0000bff8
|
||||
data8 0xa108358a4c904615, 0x0000bff8
|
||||
data8 0xe644fcbeb3ac9c90, 0x0000bff7
|
||||
data8 0x8a4bd667bf08e7de, 0x0000bff7
|
||||
data8 0x0000000000000000 // T[255] Low
|
||||
data8 0x0000000000000000 // T[255] High
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
WEAK_LIBM_ENTRY(log2)
|
||||
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,1,4,4,0
|
||||
// y=frcpa(x)
|
||||
frcpa.s1 f6,p0=f1,f8
|
||||
// will form significand of 1.5 (to test whether the index is 128 or above)
|
||||
mov r24=0xc
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize x
|
||||
fma.s1 f7=f8,f1,f0
|
||||
// r2 = address of the linkage-table entry for poly_coeffs (C_4..C_7, C_1, C_3), which is followed by T_table
|
||||
addl r2 = @ltoff(poly_coeffs), gp;;
|
||||
}
|
||||
{.mfi
|
||||
// get significand
|
||||
getf.sig r25=f8
|
||||
// f8 denormal ?
|
||||
fclass.m p8,p10=f8,0x9
|
||||
// will form significand of 1.5 (to test whether the index is 128 or above)
|
||||
shl r24=r24,60
|
||||
}
|
||||
{.mfi
|
||||
mov r26=0x804
|
||||
nop.f 0
|
||||
// r23=bias-1
|
||||
mov r23=0xfffe;;
|
||||
}
|
||||
|
||||
{.mmf
|
||||
getf.exp r29=f8
|
||||
// load start address of poly_coeffs (C_4..C_7, C_1, C_3) followed by T_table
|
||||
ld8 r2=[r2]
|
||||
// will continue only for positive normal/denormal numbers
|
||||
fclass.nm.unc p12,p7 = f8, 0x19 ;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p10
|
||||
{.mfi
|
||||
// denormal input, repeat get significand (after normalization)
|
||||
(p8) getf.sig r25=f7
|
||||
// x=1 ?
|
||||
fcmp.eq.s0 p6,p0=f8,f1
|
||||
// get T_index
|
||||
(p10) shr.u r28=r25,63-8
|
||||
}
|
||||
{.mfi
|
||||
// f32=0.5
|
||||
setf.exp f32=r23
|
||||
nop.f 0
|
||||
// r27=bias
|
||||
mov r27=0xffff;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// denormal input, repeat get exponent (after normalization)
|
||||
(p8) getf.exp r29=f7
|
||||
mov r23=0xff
|
||||
// r26=0x80400...0 (threshold for using polynomial approximation)
|
||||
shl r26=r26,64-12;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
add r3=48,r2
|
||||
// r=x*y-1
|
||||
fms.s1 f6=f6,f8,f1
|
||||
(p12) br.cond.spnt SPECIAL_LOG2
|
||||
}
|
||||
{.mfi
|
||||
// load C_4, C_5
|
||||
ldfpd f10,f11=[r2],16
|
||||
nop.f 0
|
||||
cmp.geu p12,p0=r25,r24;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// load C_6, C_7
|
||||
ldfpd f12,f13=[r2],16
|
||||
// r27=bias-1 (if index >=128, will add exponent+1)
|
||||
(p12) mov r27=0xfffe
|
||||
(p8) shr.u r28=r25,63-8;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
// load C_1
|
||||
ldfe f14=[r2],32
|
||||
fmerge.se f7=f1,f7
|
||||
// if first 9 bits after leading 1 are all zero, then p8=1
|
||||
cmp.ltu p8,p12=r25,r26
|
||||
}
|
||||
{.mfi
|
||||
// load C_3
|
||||
ldfe f15=[r3]
|
||||
nop.f 0
|
||||
// get T_index
|
||||
and r28=r28,r23;;
|
||||
}
|
||||
{.mfi
|
||||
// r29=exponent-bias
|
||||
sub r29=r29,r27
|
||||
// x=1, return 0
|
||||
(p6) fma.d.s0 f8=f0,f0,f0
|
||||
// get T address
|
||||
shladd r2=r28,4,r2
|
||||
}
|
||||
{.mfb
|
||||
// first 8 bits after leading 1 are all ones ?
|
||||
cmp.eq p10,p0=r23,r28
|
||||
// if first 8 bits after leading bit are 0, use polynomial approx. only
|
||||
(p8) fms.s1 f6=f7,f1,f1
|
||||
// x=1, return
|
||||
(p6) br.ret.spnt b0;;
|
||||
}
|
||||
{.mfi
|
||||
// r26=1
|
||||
mov r26=1
|
||||
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
|
||||
(p10) fms.s1 f6=f7,f32,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p12
|
||||
{.mmf
|
||||
// load T (unless first 9 bits after leading 1 are 0)
|
||||
(p12) ldfe f33=[r2]
|
||||
// f8=expon - bias
|
||||
setf.sig f8=r29
|
||||
// set T=0 (if first 9 bits after leading 1 are 0)
|
||||
(p8) fma.s1 f33=f0,f0,f0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P12=1-0.5*r
|
||||
fnma.s1 f32=f32,f6,f1
|
||||
// r26=2^{63}
|
||||
shl r26=r26,63
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r2=r*r
|
||||
fma.s1 f7=f6,f6,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
// significand(x)=1 ?
|
||||
cmp.eq p0,p6=r26,r25
|
||||
// P67=C_6+C_7*r
|
||||
fma.s1 f13=f13,f6,f12
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P45=C_4+C_5*r
|
||||
fma.s1 f10=f11,f6,f10
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// C_1*r
|
||||
(p6) fma.s1 f14=f14,f6,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize additive term (l=exponent of x)
|
||||
fcvt.xf f8=f8
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P13=1-0.5*r+C_3*r^2
|
||||
(p6) fma.s1 f15=f15,f7,f32
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P47=P45+r2*P67
|
||||
(p6) fma.s1 f13=f13,f7,f10
|
||||
// if significand(x)=1, return exponent (l)
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r3=r^3
|
||||
(p6) fma.s1 f7=f7,f6,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// add T+l
|
||||
(p6) fma.s1 f8=f8,f1,f33
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P17=P13+r3*P47
|
||||
(p6) fma.s1 f13=f13,f7,f15
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result=T+l+(C_1*r)*P16
|
||||
(p6) fma.d.s0 f8=f13,f14,f8
|
||||
// return
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
|
||||
SPECIAL_LOG2:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=+Infinity ?
|
||||
fclass.m p7,p0=f8,0x21
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=+/-Zero ?
|
||||
fclass.m p8,p0=f8,0x7
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=-Infinity, -normal, -denormal ?
|
||||
fclass.m p6,p0=f8,0x3a
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// log2(+Infinity)=+Infinity
|
||||
nop.f 0
|
||||
(p7) br.ret.spnt b0;;
|
||||
}
|
||||
{.mfi
|
||||
(p8) mov GR_Parameter_TAG = 170
|
||||
// log2(+/-0)=-infinity, raises Divide by Zero
|
||||
// set f8=-0
|
||||
(p8) fmerge.ns f8=f0,f8
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
(p8) frcpa.s0 f8,p0=f1,f8
|
||||
(p8) br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
{.mfb
|
||||
(p6) mov GR_Parameter_TAG = 171
|
||||
// x<0: return NaN, raise Invalid
|
||||
(p6) frcpa.s0 f8,p0=f0,f0
|
||||
(p6) br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// Remaining cases: NaNs
|
||||
fma.d.s0 f8=f8,f1,f0
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
WEAK_LIBM_END(log2)
|
||||
// Symbol aliases and versioning for the double-precision entry point.
// NOTE(review): libm_alias_double_other is a macro defined outside this
// file; presumably it emits the additional aliases of __log2 - confirm.
libm_alias_double_other (__log2, log2)
|
||||
// Shared-library builds only: set up versioned symbols.
#ifdef SHARED
|
||||
// log2 is exported as the default (@@) version GLIBC_2.29
.symver log2,log2@@GLIBC_2.29
|
||||
// __log2_compat is a weak alias of __log2 ...
.weak __log2_compat
|
||||
.set __log2_compat,__log2
|
||||
// ... bound to the older, non-default version log2@GLIBC_2.2
.symver __log2_compat,log2@GLIBC_2.2
|
||||
#endif
|
||||
|
||||
|
||||
// Error stub for log2 (double).  It is reached by a branch from the
// special-value path after GR_Parameter_TAG has been set and f8 already
// holds the default result.  The stub:
//   1. saves ar.pfs, gp and b0 in GR_SAVE_* and opens a 64-byte frame,
//   2. stores FR_Y, FR_X and FR_RESULT as doubles at new sp+32, sp+16 and
//      sp+48 and leaves their addresses in GR_Parameter_Y/_X/_RESULT,
//   3. calls __libm_error_support,
//   4. reloads f8 from the result slot (sp+48), restores sp/b0/gp/ar.pfs
//      and returns to the caller of log2.
// NOTE(review): the register equates (GR_*, FR_*) for this file are defined
// above the visible part of the source; the prototype of
// __libm_error_support is not visible here either - presumably
// (x*, y*, result*, tag) - confirm.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
// GR_Parameter_Y = old sp - 32, i.e. new sp + 32 once the frame exists
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// post-increment by 16: GR_Parameter_Y now points at the result slot (sp+48)
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
// move GR_Parameter_Y back to the parameter 2 slot (sp+32) for the call
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
// recompute the result slot address after the call
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
{ .mmi
|
||||
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
// external error handler called by the stub above
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,553 +0,0 @@
|
||||
.file "log2f.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 09/11/00 Initial version
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// float log2f(float)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
|
||||
// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8 (table index)
|
||||
// j=0 if f<128; j=1 if f>=128
|
||||
// T is a table that stores log2(1/y) (in entries 1..255) rounded to
|
||||
// double extended precision; f is used as an index; T[255]=0
|
||||
//
|
||||
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
|
||||
// and 0 is used instead of T[0]
|
||||
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
|
||||
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
|
||||
// for m=2(1-r'), 0<=r'<2^{-9})
|
||||
//
|
||||
// log2f(x) is approximated as
|
||||
// (l+j) + T[f] + (c1*r+c2*r^2+c3*r^3+c4*r^4), if f>0
|
||||
//
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// log2f(0)=-inf, raises Divide by Zero
|
||||
// log2f(+inf)=inf
|
||||
// log2f(x)=NaN, raises Invalid if x<0
|
||||
//
|
||||
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// f6-f14
|
||||
// r2-r3, r23-r30
|
||||
// p6,p7,p8,p12
|
||||
//
|
||||
|
||||
|
||||
// Register aliases used by the special-value path and by the
// __libm_error_region stub of log2f.
// The function allocates its frame with "alloc r32=ar.pfs,1,4,4,0"
// (1 input, 4 locals, 4 outputs), so r33-r36 are locals and r37-r40 are
// the output registers.
// Saved state across the call to __libm_error_support:
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35 // This reg. can safely be used
|
||||
// GR_SAVE_SP is not referenced by the log2f code in this file
GR_SAVE_SP = r36
|
||||
|
||||
// Output registers: addresses of x, y and result on the stack, and the
// error tag set on the special-value path (172 or 173 for log2f)
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
// Floating-point values stored by the error stub as parameters 1, 2 and 3
FR_X = f10
|
||||
// f1 is used throughout this file as the constant 1.0
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
// Polynomial coefficients for log2f.  The code addresses this object by
// fixed byte offsets from its start:
//   offset  0: C_3, C_4  (two doubles, loaded with ldfpd)
//   offset 16: C_1       (16-byte significand/sign+exponent pair, ldfe)
//   offset 32: C_2       (16-byte significand/sign+exponent pair, ldfe)
// After the loads the code's pointer is at offset 48 and is used as the
// base of T_table, so T_table must directly follow this object.
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe //C_3 and C_4
|
||||
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
|
||||
data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
// Lookup table T for log2f: 256 double-precision (8-byte) entries, two per
// data8 line.  The code indexes it with the 8 significand bits that follow
// the leading 1 (address = base + 8*index, loaded with ldfd).
// Entries 0..127 are positive, entries 128..254 are negative and entry 255
// is zero; for index >= 128 the code adds exponent+1 instead of exponent.
// NOTE(review): per the file header T[f] approximates log2(1/frcpa(m));
// the negative upper half presumably stores that value minus 1 - confirm.
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
data8 0x3f671b0ea42e5fda, 0x3f815cfe8eaec830
|
||||
data8 0x3f8cfee70c5ce5dc, 0x3f94564a62192834
|
||||
data8 0x3f997723ace35766, 0x3f9f5923c69b54a1
|
||||
data8 0x3fa2a094a085d693, 0x3fa538941776b01e
|
||||
data8 0x3fa8324c9b914bc7, 0x3faacf54ce07d7e9
|
||||
data8 0x3fadced958dadc12, 0x3fb0387efbca869e
|
||||
data8 0x3fb18ac6067479c0, 0x3fb30edd3e13530d
|
||||
data8 0x3fb463c15936464e, 0x3fb5b9e13c3fa21d
|
||||
data8 0x3fb7113f3259e07a, 0x3fb869dd8d1b2035
|
||||
data8 0x3fb9c3bea49d3214, 0x3fbb1ee4d7961701
|
||||
data8 0x3fbc7b528b70f1c5, 0x3fbdd90a2c676ed4
|
||||
data8 0x3fbf05d4976c2028, 0x3fc032fbbaee6d65
|
||||
data8 0x3fc0e3b5a9f3284a, 0x3fc195195c7d125b
|
||||
data8 0x3fc22dadc2ab3497, 0x3fc2e050231df57d
|
||||
data8 0x3fc379f79c2b255b, 0x3fc42ddd2ba1b4a9
|
||||
data8 0x3fc4c89b9e6807f5, 0x3fc563dc29ffacb2
|
||||
data8 0x3fc619a25f5d798d, 0x3fc6b5ffbf367644
|
||||
data8 0x3fc752e1f660f8d6, 0x3fc7f049e753e7cf
|
||||
data8 0x3fc8a8980abfbd32, 0x3fc94724cca657be
|
||||
data8 0x3fc9e63a24971f46, 0x3fca85d8feb202f7
|
||||
data8 0x3fcb2602497d5346, 0x3fcbc6b6f5ee1c9b
|
||||
data8 0x3fcc67f7f770a67e, 0x3fcceec4b2234fba
|
||||
data8 0x3fcd91097ad13982, 0x3fce33dd57f3d335
|
||||
data8 0x3fced74146bc7b10, 0x3fcf7b3646fef683
|
||||
data8 0x3fd00223a943dc19, 0x3fd054a474bf0eb7
|
||||
data8 0x3fd0999d9b9259a1, 0x3fd0eca66d3b2581
|
||||
data8 0x3fd13ffa2e85b475, 0x3fd185a444fa0a7b
|
||||
data8 0x3fd1cb8312f27eff, 0x3fd21fa1441ce5e8
|
||||
data8 0x3fd265f526e603cb, 0x3fd2baa0c34be1ec
|
||||
data8 0x3fd3016b45de21ce, 0x3fd3486c38aa29a8
|
||||
data8 0x3fd38fa3efaa8262, 0x3fd3e562c0816a02
|
||||
data8 0x3fd42d141f53b646, 0x3fd474fd543f222c
|
||||
data8 0x3fd4bd1eb680e548, 0x3fd505789e234bd1
|
||||
data8 0x3fd54e0b64003b70, 0x3fd596d761c3c1f0
|
||||
data8 0x3fd5dfdcf1eeae0e, 0x3fd6291c6fd9329c
|
||||
data8 0x3fd6729637b59418, 0x3fd6bc4aa692e0fd
|
||||
data8 0x3fd7063a1a5fb4f2, 0x3fd75064f1ed0715
|
||||
data8 0x3fd79acb8cf10390, 0x3fd7d67c1e43ae5c
|
||||
data8 0x3fd8214f4068afa7, 0x3fd86c5f36dea3dc
|
||||
data8 0x3fd8b7ac64dd7f9d, 0x3fd8f4167a0c6f92
|
||||
data8 0x3fd93fd2d5e1bf1d, 0x3fd98bcd84296946
|
||||
data8 0x3fd9c8c333e6e9a5, 0x3fda152f142981b4
|
||||
data8 0x3fda527fd95fd8ff, 0x3fda9f5e3edeb9e6
|
||||
data8 0x3fdadd0b2b5755a7, 0x3fdb2a5d6f51ff83
|
||||
data8 0x3fdb686799b00be3, 0x3fdbb62f1b887cd8
|
||||
data8 0x3fdbf4979f666668, 0x3fdc332a6e8399d4
|
||||
data8 0x3fdc819dc2d45fe4, 0x3fdcc0908e19b7bd
|
||||
data8 0x3fdcffae611ad12b, 0x3fdd3ef776d43ff4
|
||||
data8 0x3fdd8e5002710128, 0x3fddcdfb486cb9a1
|
||||
data8 0x3fde0dd294245fe4, 0x3fde4dd622a28840
|
||||
data8 0x3fde8e06317114f0, 0x3fdece62fe9a9915
|
||||
data8 0x3fdf1f164a15389a, 0x3fdf5fd8a9063e35
|
||||
data8 0x3fdfa0c8937e7d5d, 0x3fdfe1e649bb6335
|
||||
data8 0x3fe011990641535a, 0x3fe032560e91e59e
|
||||
data8 0x3fe0532a5ebcd44a, 0x3fe0741617f5fc28
|
||||
data8 0x3fe08cd653f38839, 0x3fe0adeb55c1103b
|
||||
data8 0x3fe0cf181d5d1dd0, 0x3fe0f05ccd0aced7
|
||||
data8 0x3fe111b9875788ab, 0x3fe1332e6f1bcf73
|
||||
data8 0x3fe154bba77c2088, 0x3fe16df59bfa06c1
|
||||
data8 0x3fe18fadb6e2d3c2, 0x3fe1b17e849adc26
|
||||
data8 0x3fe1caeb6a0de814, 0x3fe1ece7c830eec9
|
||||
data8 0x3fe20efd3dae01df, 0x3fe2289de375d901
|
||||
data8 0x3fe24adf9b6a6fe0, 0x3fe26d3ad1aebcfc
|
||||
data8 0x3fe287100c2771f4, 0x3fe2a9983b3c1b28
|
||||
data8 0xbfda78e146f7bef4, 0xbfda33760a7f6051
|
||||
data8 0xbfd9ff43476fb5f7, 0xbfd9b97c3c4eec8f
|
||||
data8 0xbfd98504431717fc, 0xbfd93ee07535f967
|
||||
data8 0xbfd90a228d5712b2, 0xbfd8c3a104cb24f5
|
||||
data8 0xbfd88e9c72e0b226, 0xbfd847bc33d8618e
|
||||
data8 0xbfd812703988bb69, 0xbfd7dd0569c04bff
|
||||
data8 0xbfd7959c202292f1, 0xbfd75fe8d2c5d48f
|
||||
data8 0xbfd72a1637cbc183, 0xbfd6e221cd9d0cde
|
||||
data8 0xbfd6ac059985503b, 0xbfd675c99ce81f92
|
||||
data8 0xbfd63f6db2590482, 0xbfd5f6c138136489
|
||||
data8 0xbfd5c01a39fbd688, 0xbfd58952cf519193
|
||||
data8 0xbfd5526ad18493ce, 0xbfd51b6219bfe6ea
|
||||
data8 0xbfd4d1cdf8b4846f, 0xbfd49a784bcd1b8b
|
||||
data8 0xbfd4630161832547, 0xbfd42b6911cf5465
|
||||
data8 0xbfd3f3af3461e1c4, 0xbfd3bbd3a0a1dcfb
|
||||
data8 0xbfd383d62dac7ae7, 0xbfd34bb6b2546218
|
||||
data8 0xbfd313750520f520, 0xbfd2db10fc4d9aaf
|
||||
data8 0xbfd2a28a6dc90387, 0xbfd269e12f346e2c
|
||||
data8 0xbfd2311515e2e855, 0xbfd1f825f6d88e13
|
||||
data8 0xbfd1bf13a6c9c69f, 0xbfd185ddfa1a7ed0
|
||||
data8 0xbfd14c84c4dd6128, 0xbfd11307dad30b76
|
||||
data8 0xbfd0d9670f6941fe, 0xbfd09fa235ba2020
|
||||
data8 0xbfd0790adbb03009, 0xbfd03f09858c55fb
|
||||
data8 0xbfd004e3a7c97cbd, 0xbfcf9532288fcf69
|
||||
data8 0xbfcf205339208f27, 0xbfceab2a23a5b83e
|
||||
data8 0xbfce5ce55fdd37a5, 0xbfcde73fe3b1480f
|
||||
data8 0xbfcd714f44623927, 0xbfccfb1321b8c400
|
||||
data8 0xbfccac163c770dc9, 0xbfcc355b67195dd0
|
||||
data8 0xbfcbbe540a3f036f, 0xbfcb6ecf175f95e9
|
||||
data8 0xbfcaf74751e1be33, 0xbfca7f71fb7bab9d
|
||||
data8 0xbfca2f632320b86b, 0xbfc9b70ba539dfae
|
||||
data8 0xbfc93e6587910444, 0xbfc8edcae8352b6c
|
||||
data8 0xbfc874a0db01a719, 0xbfc7fb27199df16d
|
||||
data8 0xbfc7a9fec7d05ddf, 0xbfc72fff456ac70d
|
||||
data8 0xbfc6de7d66023dbc, 0xbfc663f6fac91316
|
||||
data8 0xbfc6121ac74813cf, 0xbfc5970c478fff4a
|
||||
data8 0xbfc51bab907a5c8a, 0xbfc4c93d33151b24
|
||||
data8 0xbfc44d527fdadf55, 0xbfc3fa87be0f3a1b
|
||||
data8 0xbfc3a797cd35d959, 0xbfc32ae9e278ae1a
|
||||
data8 0xbfc2d79c6937efdd, 0xbfc25a619370d9dc
|
||||
data8 0xbfc206b5bde2f8b8, 0xbfc188ecbd1d16be
|
||||
data8 0xbfc134e1b489062e, 0xbfc0b6894488e95f
|
||||
data8 0xbfc0621e2f556b5c, 0xbfc00d8c711a12cc
|
||||
data8 0xbfbf1cd21257e18c, 0xbfbe72ec117fa5b2
|
||||
data8 0xbfbdc8b7c49a1ddb, 0xbfbcc8d5e467b710
|
||||
data8 0xbfbc1ddc9c39c7a1, 0xbfbb7294093cdd0f
|
||||
data8 0xbfba7111df348494, 0xbfb9c501cdf75872
|
||||
data8 0xbfb918a16e46335b, 0xbfb81579a73e83c6
|
||||
data8 0xbfb7684f39f4ff2d, 0xbfb6bad3758efd87
|
||||
data8 0xbfb60d060d7e41ac, 0xbfb507b836033bb7
|
||||
data8 0xbfb4591d6310d85a, 0xbfb3aa2fdd27f1c3
|
||||
data8 0xbfb2faef55ccb372, 0xbfb1f3723b4ae6db
|
||||
data8 0xbfb14360d6136ffa, 0xbfb092fb594145c1
|
||||
data8 0xbfafc482e8b48a7e, 0xbfae6265ace11ae4
|
||||
data8 0xbfacff9e5c4341d0, 0xbfaaea3316095f72
|
||||
data8 0xbfa985bfc3495194, 0xbfa820a01ac754cb
|
||||
data8 0xbfa6bad3758efd87, 0xbfa554592bb8cd58
|
||||
data8 0xbfa3ed3094685a26, 0xbfa2855905ca70f6
|
||||
data8 0xbfa11cd1d5133413, 0xbf9dfd78881399f1
|
||||
data8 0xbf9b28f618cc85df, 0xbf98530faa3c087b
|
||||
data8 0xbf957bc3dddcd7fa, 0xbf92a3115322f9e6
|
||||
data8 0xbf8f91ed4eef8370, 0xbf89dae4ec6b8b2e
|
||||
data8 0xbf842106b1499209, 0xbf7cc89f97d67594
|
||||
data8 0xbf71497accf7e11d, 0x0000000000000000
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
.section .text
|
||||
// float log2f(float): argument and result in f8.
// Main path (positive normal/denormal x):
//   - y=frcpa(x), r=x*y-1; the 8 significand bits after the leading 1
//     select T from T_table; the unbiased exponent l (l+1 when the index
//     is >= 128) is converted to floating point,
//   - if the significand is below 0x8040...0, r=m-1 and T=0 are used; if
//     the 8 index bits are all ones, r=m/2-1 is used (T_table[255] is 0),
//   - result = (l + T) + r*(C_1 + C_2*r + r^2*(C_3 + C_4*r)).
// x=1 returns 0 early; an exact power of two returns the exponent alone.
// Everything that is not a positive normal/denormal goes to SPECIAL_log2f.
WEAK_LIBM_ENTRY(log2f)
|
||||
|
||||
{ .mfi
|
||||
// 1 input, 4 locals (r33-r36), 4 outputs (r37-r40), no rotating registers
alloc r32=ar.pfs,1,4,4,0
|
||||
// y=frcpa(x)
|
||||
frcpa.s1 f6,p0=f1,f8
|
||||
// will form significand of 1.5 (to test whether the index is 128 or above)
|
||||
mov r24=0xc
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize x
|
||||
fma.s1 f7=f8,f1,f0
|
||||
// r2 = pointer to C_1...C_4 followed by T_table
|
||||
addl r2 = @ltoff(poly_coeffs), gp;;
|
||||
}
|
||||
{.mfi
|
||||
// get significand
|
||||
getf.sig r25=f8
|
||||
// f8 denormal ?
|
||||
fclass.m p8,p10=f8,0x9
|
||||
// will form significand of 1.5 (to test whether the index is 128 or above)
|
||||
shl r24=r24,60
|
||||
}
|
||||
{.mfi
|
||||
// r26=0x804, shifted left by 64-12 further down to form the threshold
mov r26=0x804
|
||||
nop.f 0
|
||||
// r23=bias-1
|
||||
mov r23=0xfffe;;
|
||||
}
|
||||
|
||||
{.mmf
|
||||
// get exponent
getf.exp r29=f8
|
||||
// load start address for C_1...C_4 followed by T_table
|
||||
ld8 r2=[r2]
|
||||
// will continue only for positive normal/denormal numbers
|
||||
fclass.nm.unc p12,p7 = f8, 0x19 ;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p10
|
||||
{.mfi
|
||||
// denormal input, repeat get significand (after normalization)
|
||||
(p8) getf.sig r25=f7
|
||||
// x=1 ?
|
||||
fcmp.eq.s0 p6,p0=f8,f1
|
||||
// get T_index
|
||||
(p10) shr.u r28=r25,63-8
|
||||
}
|
||||
{.mfi
|
||||
// f12=0.5
|
||||
setf.exp f12=r23
|
||||
nop.f 0
|
||||
// r27=bias
|
||||
mov r27=0xffff;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
// denormal input, repeat get exponent (after normalization)
|
||||
(p8) getf.exp r29=f7
|
||||
nop.f 0
|
||||
// zero, negative, infinite and NaN inputs are handled out of line
(p12) br.cond.spnt SPECIAL_log2f
|
||||
}
|
||||
{.mfi
|
||||
// p12=1 if significand >= 1.5 (index is 128 or above)
cmp.geu p12,p0=r25,r24
|
||||
nop.f 0
|
||||
// r23 is reused as the 8-bit index mask
mov r23=0xff;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// r3 = address of C_2 (offset 32 in poly_coeffs)
add r3=32,r2
|
||||
// r=x*y-1
|
||||
fms.s1 f6=f6,f8,f1
|
||||
// r26=0x80400...0 (threshold for using polynomial approximation)
|
||||
shl r26=r26,64-12
|
||||
}
|
||||
{.mfi
|
||||
// load C_3, C_4
|
||||
ldfpd f10,f11=[r2],16
|
||||
nop.f 0
|
||||
// r27=bias-1 (if index >=128, will add exponent+1)
|
||||
(p12) mov r27=0xfffe;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// load C_1
|
||||
// (post-increment by 32 leaves r2 at offset 48, the base used for T_table)
ldfe f14=[r2],32
|
||||
// x=1, return 0
|
||||
(p6) fma.s.s0 f8=f0,f0,f0
|
||||
// denormal input: get T_index from the normalized significand
(p8) shr.u r28=r25,63-8
|
||||
}
|
||||
{.mib
|
||||
// load C_2
|
||||
ldfe f13=[r3]
|
||||
// r29=exponent-bias
|
||||
sub r29=r29,r27
|
||||
// x=1, return
|
||||
(p6) br.ret.spnt b0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
// get T_index
|
||||
and r28=r28,r23
|
||||
// f7 = significand of x with the exponent (and sign) of 1.0
fmerge.se f7=f1,f7
|
||||
// if first 9 bits after leading 1 are all zero, then p8=1
|
||||
cmp.ltu p8,p12=r25,r26;;
|
||||
}
|
||||
{.mfi
|
||||
// f8=expon - bias
|
||||
setf.sig f8=r29
|
||||
nop.f 0
|
||||
// get T address
|
||||
// (8 bytes per table entry)
shladd r2=r28,3,r2
|
||||
}
|
||||
{.mfi
|
||||
// first 8 bits after leading 1 are all ones ?
|
||||
cmp.eq p10,p0=r23,r28
|
||||
// if first 8 bits after leading bit are 0, use polynomial approx. only
|
||||
// (r = m-1)
(p8) fms.s1 f6=f7,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
//r26=1
|
||||
mov r26=1
|
||||
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
|
||||
// (r = m*0.5-1)
(p10) fms.s1 f6=f7,f12,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p12
|
||||
{.mmf
|
||||
// load T (unless first 9 bits after leading 1 are 0)
|
||||
(p12) ldfd f12=[r2]
|
||||
nop.m 0
|
||||
// set T=0 (if first 9 bits after leading 1 are 0)
|
||||
(p8) fma.s1 f12=f0,f0,f0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P34=C_3+C_4*r
|
||||
fma.s1 f10=f11,f6,f10
|
||||
// r26=2^{63}
|
||||
shl r26=r26,63
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r2=r*r
|
||||
fma.s1 f11=f6,f6,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
// significand of x is 1 ?
|
||||
// (p6=1 when it is not, i.e. the polynomial and table terms are needed)
cmp.eq p0,p6=r25,r26
|
||||
// P12=C_1+C_2*r
|
||||
fma.s1 f14=f13,f6,f14
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize additive term (l=exponent of x)
|
||||
fcvt.xf f8=f8
|
||||
// if significand(x)=1, return exponent (l)
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// add T+l
|
||||
(p6) fma.s1 f8=f8,f1,f12
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P14=P12+r2*P34
|
||||
(p6) fma.s1 f13=f10,f11,f14
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result=T+l+r*P14
|
||||
(p6) fma.s.s0 f8=f13,f6,f8
|
||||
// return
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
|
||||
// Out-of-line path of log2f for every input that is not a positive
// normal/denormal number.  On entry f8 still holds the argument x.
//   x = +Infinity : return x unchanged
//   x = +/-0      : f8 = -Infinity (Divide by Zero raised by frcpa),
//                   tag 172, continue in __libm_error_region
//   x < 0         : f8 = NaN (Invalid raised by frcpa), tag 173,
//                   continue in __libm_error_region
//   otherwise NaN : return x*1+0 rounded to single precision
// __libm_error_region stores FR_X as parameter 1, so the original argument
// is copied into FR_X here before f8 is overwritten with the result.
SPECIAL_log2f:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=+Infinity ?
|
||||
fclass.m p7,p0=f8,0x21
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=+/-Zero ?
|
||||
fclass.m p8,p0=f8,0x7
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=-Infinity, -normal, -denormal ?
|
||||
fclass.m p6,p0=f8,0x3a
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// log2f(+Infinity)=+Infinity
|
||||
// keep a copy of the argument for the error handler (f8 is replaced by
// the result in the following bundles)
fmerge.s FR_X=f8,f8
|
||||
(p7) br.ret.spnt b0;;
|
||||
}
|
||||
{.mfi
|
||||
(p8) mov GR_Parameter_TAG = 172
|
||||
// log2f(+/-0)=-infinity, raises Divide by Zero
|
||||
// set f8=-0
|
||||
(p8) fmerge.ns f8=f0,f8
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// 1/(-0) = -Infinity
(p8) frcpa.s0 f8,p0=f1,f8
|
||||
(p8) br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
{.mfb
|
||||
(p6) mov GR_Parameter_TAG = 173
|
||||
// x<0: return NaN, raise Invalid
|
||||
(p6) frcpa.s0 f8,p0=f0,f0
|
||||
(p6) br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// Remaining cases: NaNs
|
||||
fma.s.s0 f8=f8,f1,f0
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
WEAK_LIBM_END(log2f)
|
||||
// Symbol aliases and versioning for the single-precision entry point.
// NOTE(review): libm_alias_float_other is a macro defined outside this
// file; presumably it derives the float symbol names from the base names
// given here and emits the additional aliases - confirm.
libm_alias_float_other (__log2, log2)
|
||||
// Shared-library builds only: set up versioned symbols.
#ifdef SHARED
|
||||
// log2f is exported as the default (@@) version GLIBC_2.27
.symver log2f,log2f@@GLIBC_2.27
|
||||
// __log2f_compat is a weak alias of __log2f ...
.weak __log2f_compat
|
||||
.set __log2f_compat,__log2f
|
||||
// ... bound to the older, non-default version log2f@GLIBC_2.2
.symver __log2f_compat,log2f@GLIBC_2.2
|
||||
#endif
|
||||
|
||||
|
||||
// Error stub for log2f.  It is reached by a branch from SPECIAL_log2f
// after GR_Parameter_TAG has been set (172 or 173) and f8 already holds
// the default result.  The stub:
//   1. saves ar.pfs, gp and b0 in GR_SAVE_* and opens a 64-byte frame,
//   2. stores FR_Y (f1), FR_X (f10) and FR_RESULT (f8) in single precision
//      at new sp+32, sp+16 and sp+48 and leaves their addresses in
//      GR_Parameter_Y/_X/_RESULT (r38, r37, r39),
//   3. calls __libm_error_support,
//   4. reloads f8 from the result slot (sp+48), restores sp/b0/gp/ar.pfs
//      and returns to the caller of log2f.
// NOTE(review): parameter 1 is taken from FR_X (f10), so the code that
// branches here must have placed the original argument there - confirm at
// the branch sites.  The prototype of __libm_error_support is not visible
// in this file; presumably (x*, y*, result*, tag) - confirm.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
// GR_Parameter_Y = old sp - 32, i.e. new sp + 32 once the frame exists
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// post-increment by 16: GR_Parameter_Y now points at the result slot (sp+48)
stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
// move GR_Parameter_Y back to the parameter 2 slot (sp+32) for the call
add GR_Parameter_Y = -16,GR_Parameter_Y
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
// recompute the result slot address after the call
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
{ .mmi
|
||||
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
// external error handler called by the stub above
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
@ -1,815 +0,0 @@
|
||||
.file "log2l.s"
|
||||
|
||||
|
||||
// Copyright (c) 2000 - 2003, Intel Corporation
|
||||
// All rights reserved.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote
|
||||
// products derived from this software without specific prior written
|
||||
// permission.
|
||||
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Intel Corporation is the author of this code, and requests that all
|
||||
// problem reports or change requests be submitted to it directly at
|
||||
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
||||
//
|
||||
// History
|
||||
//==============================================================
|
||||
// 09/25/00 Initial version
|
||||
// 11/22/00 Fixed accuracy bug (for mantissas near 1, 2)
|
||||
// 12/07/00 Fixed C_1l constant, eliminated rounding errors in
|
||||
// reduced argument (x*frcpa(x)-1)
|
||||
// 05/20/02 Cleaned up namespace and sf0 syntax
|
||||
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
||||
//
|
||||
// API
|
||||
//==============================================================
|
||||
// long double log2l(long double)
|
||||
//
|
||||
// Overview of operation
|
||||
//==============================================================
|
||||
// Background
|
||||
//
|
||||
// Implementation
|
||||
//
|
||||
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
|
||||
// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8
|
||||
// T_hi is a table that stores the 24 most significant bits of log2(1/y)
|
||||
// (in entries 1..255) in single precision format
|
||||
// T_low is a table that stores (log2(1/y)-T_high), rounded to double
|
||||
// precision
|
||||
//
|
||||
// f is used as an index; T_high[255]=T_low[255]=0
|
||||
//
|
||||
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
|
||||
// and 0 is used instead of T_high[0], T_low[0]
|
||||
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
|
||||
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
|
||||
// for m=2(1-r'), 0<=r'<2^{-9})
|
||||
//
|
||||
// If 2^{-9}<=m<2-2^{-8} or (input not near 1), let C1r=(2^{16}+C1*r)-2^{16}
|
||||
// and let E=((RN(m*y)-1)-r)+(m*y-RN(m*y))
|
||||
// Else let C1r=C1*r (rounded to 64 significant bits) and let E=0
|
||||
//
|
||||
// Let D=C1*r-C1r
|
||||
//
|
||||
//
|
||||
// log2l(x) is approximated as
|
||||
// (l+T_high[f]+C1r) + (D+r*(c1+c2*r+c3*r^2...+c8*r^7)+(T_low[f]+C_1*E))
|
||||
//
|
||||
|
||||
|
||||
// Special values
|
||||
//==============================================================
|
||||
// log2l(0)=-inf, raises Divide by Zero
|
||||
// log2l(+inf)=inf
|
||||
// log2l(x)=NaN, raises Invalid if x<0
|
||||
//
|
||||
|
||||
|
||||
// Registers used
|
||||
//==============================================================
|
||||
// f6-f15, f32-f36
|
||||
// r2-r3, r23-r23
|
||||
// p6,p7,p8,p12
|
||||
//
|
||||
|
||||
|
||||
// Register aliases for log2l.  The names and register numbers are the
// same as in the log2f file.
// NOTE(review): the log2l code that uses them lies beyond the visible
// part of this file; presumably GR_SAVE_* hold b0/ar.pfs/gp across the
// call to the error handler and GR_Parameter_* / FR_* carry its
// arguments - confirm against the code below.
GR_SAVE_B0 = r33
|
||||
GR_SAVE_PFS = r34
|
||||
GR_SAVE_GP = r35 // This reg. can safely be used
|
||||
GR_SAVE_SP = r36
|
||||
|
||||
GR_Parameter_X = r37
|
||||
GR_Parameter_Y = r38
|
||||
GR_Parameter_RESULT = r39
|
||||
GR_Parameter_TAG = r40
|
||||
|
||||
|
||||
FR_X = f10
|
||||
FR_Y = f1
|
||||
FR_RESULT = f8
|
||||
|
||||
|
||||
|
||||
|
||||
// Data tables
|
||||
//==============================================================
|
||||
|
||||
RODATA
|
||||
|
||||
.align 16
|
||||
|
||||
// Polynomial coefficients for log2l, 16 bytes per line:
//   C_1, C_1l and C_2 are (64-bit significand, sign+exponent) pairs,
//   C_3..C_8 are stored as pairs of 8-byte values.
// NOTE(review): the code that loads these values is beyond the visible
// part of this file, so the access order and offsets are not documented
// here - confirm before reordering any entry.
LOCAL_OBJECT_START(poly_coeffs)
|
||||
|
||||
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
|
||||
data8 0x3fca61762a7aded9, 0xbfc71547652b82fe // C_7, C_8
|
||||
data8 0x3fd2776c50ef9bfe, 0xbfcec709dc3a03fd // C_5, C_6
|
||||
data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe // C_3, C_4
|
||||
// The next line is a disabled earlier value of C_1l; presumably it is the
// constant replaced by the "12/07/00 Fixed C_1l constant" history entry.
//data8 0xd871319ff0342580, 0x0000bfbd // C_1l (low part of C1)
|
||||
data8 0x82f0025f2dc582ee, 0x0000bfbe // C_1l (low part of C1)
|
||||
data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2
|
||||
LOCAL_OBJECT_END(poly_coeffs)
|
||||
|
||||
|
||||
|
||||
|
||||
// T_high table: 256 single-precision entries (data4), 1024 bytes.
// log2l indexes it with f = the 8 significand bits that follow the leading 1
// (shr.u by 63-8, masked with 0xff) and loads T_table + 4*f with ldfs.
// The final entry (f=255) is 0.
// log2l reaches this table as poly_coeffs+96 and T_low as T_table+1024, so
// poly_coeffs, T_table and T_low must stay adjacent and in this order.
LOCAL_OBJECT_START(T_table)
|
||||
|
||||
data4 0x3b38d875, 0x3c0ae7f4, 0x3c67f738, 0x3ca2b253
|
||||
data4 0x3ccbb91d, 0x3cfac91e, 0x3d1504a5, 0x3d29c4a0
|
||||
data4 0x3d419264, 0x3d567aa6, 0x3d6e76ca, 0x3d81c3f7
|
||||
data4 0x3d8c5630, 0x3d9876e9, 0x3da31e0a, 0x3dadcf09
|
||||
data4 0x3db889f9, 0x3dc34eec, 0x3dce1df5, 0x3dd8f726
|
||||
data4 0x3de3da94, 0x3deec851, 0x3df82ea4, 0x3e0197dd
|
||||
data4 0x3e071dad, 0x3e0ca8ca, 0x3e116d6e, 0x3e170281
|
||||
data4 0x3e1bcfbc, 0x3e216ee9, 0x3e2644dc, 0x3e2b1ee1
|
||||
data4 0x3e30cd12, 0x3e35affd, 0x3e3a970f, 0x3e3f824f
|
||||
data4 0x3e4544c0, 0x3e4a3926, 0x3e4f31d1, 0x3e542ec7
|
||||
data4 0x3e593012, 0x3e5e35b7, 0x3e633fbf, 0x3e677625
|
||||
data4 0x3e6c884b, 0x3e719eea, 0x3e76ba0a, 0x3e7bd9b2
|
||||
data4 0x3e80111d, 0x3e82a523, 0x3e84ccec, 0x3e876533
|
||||
data4 0x3e89ffd1, 0x3e8c2d22, 0x3e8e5c18, 0x3e90fd0a
|
||||
data4 0x3e932fa9, 0x3e95d506, 0x3e980b5a, 0x3e9a4361
|
||||
data4 0x3e9c7d1f, 0x3e9f2b16, 0x3ea168a0, 0x3ea3a7ea
|
||||
data4 0x3ea5e8f5, 0x3ea82bc4, 0x3eaa705b, 0x3eacb6bb
|
||||
data4 0x3eaefee7, 0x3eb148e3, 0x3eb394b1, 0x3eb5e255
|
||||
data4 0x3eb831d0, 0x3eba8327, 0x3ebcd65c, 0x3ebeb3e0
|
||||
data4 0x3ec10a7a, 0x3ec362f9, 0x3ec5bd63, 0x3ec7a0b3
|
||||
data4 0x3ec9fe96, 0x3ecc5e6c, 0x3ece4619, 0x3ed0a978
|
||||
data4 0x3ed293fe, 0x3ed4faf1, 0x3ed6e859, 0x3ed952eb
|
||||
data4 0x3edb433c, 0x3eddb178, 0x3edfa4bc, 0x3ee19953
|
||||
data4 0x3ee40cee, 0x3ee60484, 0x3ee7fd73, 0x3ee9f7bb
|
||||
data4 0x3eec7280, 0x3eee6fda, 0x3ef06e94, 0x3ef26eb1
|
||||
data4 0x3ef47031, 0x3ef67317, 0x3ef8f8b2, 0x3efafec5
|
||||
data4 0x3efd0644, 0x3eff0f32, 0x3f008cc8, 0x3f0192b0
|
||||
data4 0x3f029952, 0x3f03a0b0, 0x3f0466b2, 0x3f056f5a
|
||||
data4 0x3f0678c0, 0x3f0782e6, 0x3f088dcc, 0x3f099973
|
||||
data4 0x3f0aa5dd, 0x3f0b6fac, 0x3f0c7d6d, 0x3f0d8bf4
|
||||
data4 0x3f0e575b, 0x3f0f673e, 0x3f1077e9, 0x3f1144ef
|
||||
data4 0x3f1256fc, 0x3f1369d6, 0x3f143880, 0x3f154cc1
|
||||
data4 0x3f161c7a, 0x3f173227, 0x3f1802f2, 0x3f191a0f
|
||||
data4 0x3f19ebee, 0x3f1b047e, 0x3f1bd775, 0x3f1cf17b
|
||||
data4 0x3f1dc58e, 0x3f1ee10f, 0x3f1fb63f, 0x3f208bea
|
||||
data4 0x3f21a98f, 0x3f22805c, 0x3f2357a7, 0x3f247778
|
||||
data4 0x3f254fe9, 0x3f2628d9, 0x3f270249, 0x3f2824fb
|
||||
data4 0x3f28ff97, 0x3f29dab4, 0x3f2ab654, 0x3f2b9277
|
||||
data4 0x3f2cb8c8, 0x3f2d961e, 0x3f2e73fa, 0x3f2f525b
|
||||
data4 0x3f303143, 0x3f3110b1, 0x3f31f0a7, 0x3f32d125
|
||||
data4 0x3f33b22b, 0x3f3493bc, 0x3f3575d6, 0x3f36587b
|
||||
data4 0x3f373bab, 0x3f381f68, 0x3f3903b1, 0x3f39e888
|
||||
data4 0x3f3acdec, 0x3f3bb3e0, 0x3f3c9a63, 0x3f3d8177
|
||||
data4 0x3f3e1bd4, 0x3f3f03d9, 0x3f3fec71, 0x3f40d59b
|
||||
data4 0x3f41bf59, 0x3f42a9ab, 0x3f434635, 0x3f443180
|
||||
data4 0x3f451d61, 0x3f4609d9, 0x3f46a7d3, 0x3f479549
|
||||
data4 0x3f488357, 0x3f492261, 0x3f4a1171, 0x3f4b011c
|
||||
data4 0x3f4ba139, 0x3f4c91e8, 0x3f4d8334, 0x3f4e246a
|
||||
data4 0x3f4f16be, 0x3f5009b1, 0x3f50ac02, 0x3f51a001
|
||||
data4 0x3f524305, 0x3f533812, 0x3f53dbca, 0x3f54d1e7
|
||||
data4 0x3f55c8a8, 0x3f566d85, 0x3f57655b, 0x3f580af0
|
||||
data4 0x3f58b0d0, 0x3f59aa2c, 0x3f5a50c7, 0x3f5b4b3c
|
||||
data4 0x3f5bf294, 0x3f5cee26, 0x3f5d963c, 0x3f5e92ed
|
||||
data4 0x3f5f3bc3, 0x3f5fe4e7, 0x3f60e32d, 0x3f618d13
|
||||
data4 0x3f623748, 0x3f63372a, 0x3f63e223, 0x3f648d6b
|
||||
data4 0x3f658eee, 0x3f663afe, 0x3f66e75e, 0x3f67ea86
|
||||
data4 0x3f6897b0, 0x3f69452c, 0x3f69f2f9, 0x3f6af847
|
||||
data4 0x3f6ba6e2, 0x3f6c55d0, 0x3f6d0510, 0x3f6e0c8d
|
||||
data4 0x3f6ebc9f, 0x3f6f6d04, 0x3f701dbe, 0x3f70cecd
|
||||
data4 0x3f718030, 0x3f728ae6, 0x3f733d20, 0x3f73efaf
|
||||
data4 0x3f74a296, 0x3f7555d3, 0x3f760967, 0x3f76bd53
|
||||
data4 0x3f777197, 0x3f7880a1, 0x3f7935c2, 0x3f79eb3c
|
||||
data4 0x3f7aa10f, 0x3f7b573b, 0x3f7c0dc2, 0x3f7cc4a3
|
||||
data4 0x3f7d7bdf, 0x3f7e3376, 0x3f7eeb68, 0x00000000
|
||||
LOCAL_OBJECT_END(T_table)
|
||||
|
||||
|
||||
|
||||
// T_low table: 256 double-precision entries (data8), indexed by the same f
// as T_table; log2l loads T_low + 8*f with ldfd and adds it into the low
// part of the result. The final entry (f=255) is 0.
// log2l computes this table's address as T_table+1024, so it must directly
// follow T_table.
LOCAL_OBJECT_START(T_low)
|
||||
|
||||
|
||||
data8 0x3dc0b97f689876ef, 0x3dfd5d906028ac01
|
||||
data8 0x3df8b9cbb8d7240b, 0x3de0c941a2f220cd
|
||||
data8 0x3e09c6aecba15936, 0x3dfa6d528241827c
|
||||
data8 0x3dd0bad25714903c, 0x3e2776b01dc036a2
|
||||
data8 0x3e2b914bc77f158b, 0x3e1c0fafd29dc74a
|
||||
data8 0x3e28dadc119cd3de, 0x3e3bca869da085be
|
||||
data8 0x3e19d1e700f2200a, 0x3e3e13530cc37504
|
||||
data8 0x3e3936464d9c41ee, 0x3e3c3fa21c9499d0
|
||||
data8 0x3e3259e079b6c6e8, 0x3e2a364069c4f7f3
|
||||
data8 0x3e1274c84f6c6364, 0x3e3796170159f454
|
||||
data8 0x3e26e1e389f4364e, 0x3e28cedda8c7f658
|
||||
data8 0x3e376c2028433268, 0x3e4aee6d650c82e1
|
||||
data8 0x3e33e65094fbeeb4, 0x3e4c7d125aa92c5d
|
||||
data8 0x3e1559a4b69691d8, 0x3e18efabeb7d7221
|
||||
data8 0x3e4c2b255abaa8de, 0x3e37436952a4538b
|
||||
data8 0x3e4e6807f4ba00b8, 0x3e33ff5964190e42
|
||||
data8 0x3e4f5d798cead43c, 0x3e4f3676443bf453
|
||||
data8 0x3e4660f8d5bc1bf5, 0x3e2d4f9f3ab04f36
|
||||
data8 0x3e357f7a64ccd537, 0x3e394caf7c9b05af
|
||||
data8 0x3e225c7d17ab29b0, 0x3e4eb202f6d55a12
|
||||
data8 0x3e32faa68b19bcd2, 0x3e45ee1c9b566a8b
|
||||
data8 0x3e4770a67de054ff, 0x3e42234fb9de6d6b
|
||||
data8 0x3e4ad139825c6e19, 0x3e47f3d334814a93
|
||||
data8 0x3e2af1ec402867b6, 0x3e2bfbda0c956e3d
|
||||
data8 0x3e4287b831e77ff2, 0x3e54bf0eb77f7b89
|
||||
data8 0x3e5b9259a1029607, 0x3e4a764b015e699d
|
||||
data8 0x3e4d0b68ea883ab5, 0x3e33e829ecdadf46
|
||||
data8 0x3e52f27efef3031b, 0x3e3073979e4af89e
|
||||
data8 0x3e3b980f2cd6c253, 0x3e2a5f0f5f7f66a9
|
||||
data8 0x3e37788738117b02, 0x3e58aa29a784d52f
|
||||
data8 0x3e4f5504c4ff2466, 0x3e002d40340fa647
|
||||
data8 0x3e5f53b64592f4c3, 0x3e543f222c526802
|
||||
data8 0x3e5680e547a872fa, 0x3e5e234bd1154450
|
||||
data8 0x3e3000edc18b6d21, 0x3e1c3c1f000942a8
|
||||
data8 0x3e51eeae0e442d6e, 0x3e4fb265376623f2
|
||||
data8 0x3e57b5941782d830, 0x3e3a4b83f24ae52c
|
||||
data8 0x3e5a5fb4f23978de, 0x3e51ed071563fb02
|
||||
data8 0x3e49e2071f51a7a8, 0x3e5e43ae5b924234
|
||||
data8 0x3dfa2be9aedf374a, 0x3e56dea3dbba67d5
|
||||
data8 0x3e3375fe732b3c3e, 0x3e5a0c6f91f2e77e
|
||||
data8 0x3e55e1bf1c969e41, 0x3e30a5a5166b8eee
|
||||
data8 0x3e53e6e9a539d46c, 0x3e542981b3d7b0e6
|
||||
data8 0x3e595fd8ff36ad64, 0x3e5edeb9e65cbbb4
|
||||
data8 0x3e46aeab4d3434c1, 0x3e4ea3ff0564b010
|
||||
data8 0x3e59b00be2e3c25a, 0x3e5b887cd7b0821f
|
||||
data8 0x3e5f666668547b4d, 0x3e4d0733a805273f
|
||||
data8 0x3e26a2ff21c4aec5, 0x3e4c336f7a3a78f3
|
||||
data8 0x3e11ad12b628e2d0, 0x3e56d43ff3f0ea64
|
||||
data8 0x3e238809433cccd2, 0x3e40d9734147d40f
|
||||
data8 0x3e54245fe3e24e06, 0x3e251441fce4d48c
|
||||
data8 0x3e517114efc5d1f9, 0x3e5e9a99154b0d82
|
||||
data8 0x3e442a71337970f8, 0x3e420c7c69211fdf
|
||||
data8 0x3e537e7d5d43c6a7, 0x3e4376c66ad9ad8b
|
||||
data8 0x3e49054d678a4f1c, 0x3e5d23cb3bc19f18
|
||||
data8 0x3e6ebcd449dcab2b, 0x3e67f5fc2849c88a
|
||||
data8 0x3e63f388395d3e84, 0x3e65c1103b0ad7e9
|
||||
data8 0x3e6d5d1dd031f353, 0x3e5a159dae75c4d0
|
||||
data8 0x3e4d5e22aa75f71d, 0x3e5e379ee62e1e35
|
||||
data8 0x3e4df082213cb2dc, 0x3e6bfa06c156f521
|
||||
data8 0x3e66e2d3c19b517b, 0x3e426b7098590071
|
||||
data8 0x3e541bd027e9854e, 0x3e5061dd924b0ac0
|
||||
data8 0x3e6dae01df373a03, 0x3e3baec80b207b0b
|
||||
data8 0x3e6b6a6fe06bebac, 0x3e61aebcfc3ab5d1
|
||||
data8 0x3e584ee3e7c79d83, 0x3e6b3c1b2840cb40
|
||||
data8 0x3e6c842085d6befd, 0x3e6ac04fd7b141e0
|
||||
data8 0x3e6c48250474141d, 0x3e2d889b86125f69
|
||||
data8 0x3e6e74740225dad0, 0x3e45940d31d50a7c
|
||||
data8 0x3e695476a6c39ddc, 0x3e6d9a6d857a060a
|
||||
data8 0x3e4a3e9bb4b69337, 0x3e484f3ce4707ed6
|
||||
data8 0x3e39dd125d25fc27, 0x3e563fb400de8732
|
||||
data8 0x3e5fdd6d0ee28b48, 0x3e669d15b869bb07
|
||||
data8 0x3e40687cfad7964d, 0x3e69317990d43957
|
||||
data8 0x3e633d57e24ae1bd, 0x3e618bf03710eabb
|
||||
data8 0x3e4b4df6fccd1160, 0x3e3fb26ddaa1ec45
|
||||
data8 0x3e3810a5e1817fd4, 0x3e6857373642fa5c
|
||||
data8 0x3e673db6193add31, 0x3e63200c8acbc9c3
|
||||
data8 0x3e3d2dee448ebb62, 0x3e6a19723a80db6a
|
||||
data8 0x3e5e7cdab8fd3e6a, 0x3e671855cd660672
|
||||
data8 0x3e473c3c78a85ecd, 0x3e5f5e23056a7cf2
|
||||
data8 0x3e52538519527367, 0x3e4b573bcf2580e9
|
||||
data8 0x3e6d6f856fe90c60, 0x3e2d932a8487642e
|
||||
data8 0x3e5236fc78b6174c, 0x3e50cb91d406db50
|
||||
data8 0x3e650e8bd562aa57, 0x3e424ee3d9a82f2e
|
||||
data8 0x3e59363960e1e3d9, 0x3e379604c1150a3e
|
||||
data8 0x3e6d914f6c2ac258, 0x3e62967a451a7b48
|
||||
data8 0x3e684b5f01139cb2, 0x3e448bbfbf6d292c
|
||||
data8 0x3e6227e7fb487e73, 0x3e6d39d50290f458
|
||||
data8 0x3e58368342b4b668, 0x3e65dc0c25bd1763
|
||||
data8 0x3e61b7dc362e22b5, 0x3e671691f094bb80
|
||||
data8 0x3e5011642d5123f2, 0x3e4c4eb7f11e41be
|
||||
data8 0x3e5dcee36ca242cf, 0x3e6791cefff688f1
|
||||
data8 0x3e60e23c8dda4ecd, 0x3e48e6a22fe78cfe
|
||||
data8 0x3e6d703f244adc86, 0x3e6a281a85a5049d
|
||||
data8 0x3e570f20e6403d9e, 0x3e2211518a12956f
|
||||
data8 0x3e6737d1e54d71df, 0x3e66b1881476f5e9
|
||||
data8 0x3e6e1bbeef085376, 0x3e47cad4944a32be
|
||||
data8 0x3e527f2c738e7ee9, 0x3e699883a4b9fb29
|
||||
data8 0x3e5c17d1108740d9, 0x3e5d4a9c79a43389
|
||||
data8 0x3e49fdc24462ba3b, 0x3e24dbb3a60cceb2
|
||||
data8 0x3e5c5bf618780748, 0x3e5c38005b0c778c
|
||||
data8 0x3e6be168dd6dd3fe, 0x3e633ab9370693b0
|
||||
data8 0x3dd290556b0ae339, 0x3e607c317927096a
|
||||
data8 0x3e59651353b3d90e, 0x3e4d8751e5e0ae0d
|
||||
data8 0x3e46c81023272a85, 0x3e6b23c988f391b2
|
||||
data8 0x3e608741d215209c, 0x3e60b8ba506d758f
|
||||
data8 0x3e62ddbe74803297, 0x3e5dbb8b5087587d
|
||||
data8 0x3e642aa529048131, 0x3e3dcbda6835dcf4
|
||||
data8 0x3e6db503ce854d2a, 0x3e6dd00b49bc6849
|
||||
data8 0x3e4db2f11243bc84, 0x3e3b9848efc2ea97
|
||||
data8 0x3e58f18e17c82609, 0x3e6ed8645e16c312
|
||||
data8 0x3e4065bdb60a5dd4, 0x3e490453c6e6c30a
|
||||
data8 0x3e62373994aa31ba, 0x3e56305f0e6b2a95
|
||||
data8 0x3e68c1601a6614ee, 0x3e614e204f19d93f
|
||||
data8 0x3e6e5037ca773299, 0x3e693f98892561a6
|
||||
data8 0x3e639de4f4bf700d, 0x3e416c071e93fd97
|
||||
data8 0x3e65466991b415ef, 0x3e6896a324afac9d
|
||||
data8 0x3e44f64802e2f11c, 0x3e64d7d747e2191a
|
||||
data8 0x3e6174b7581de84c, 0x3e44c7b946e1d43c
|
||||
data8 0x3e6a3bcbe30512ec, 0x3e5d3ed411c95ce4
|
||||
data8 0x3e3e5b5735cfaf8e, 0x3e6e538ab34efb51
|
||||
data8 0x3e514e204f19d93f, 0x3e5a88e6550c89a4
|
||||
data8 0x3e66b97a5d9dfd8b, 0x3e5f46b1e14ebaf3
|
||||
data8 0x3e357665f6893f5d, 0x3e6bbf633078d1d5
|
||||
data8 0x3e5e7337a212c417, 0x3e3570fde15fc8cc
|
||||
data8 0x3e21119402da92b4, 0x3e6566e830d1ff3b
|
||||
data8 0x3e558883e480e220, 0x3e589ca3a68da411
|
||||
data8 0x3e44eb66df73d648, 0x3e1a0a629b1b7e68
|
||||
data8 0x3e54cc207b8c1116, 0x0000000000000000
|
||||
LOCAL_OBJECT_END(T_low)
|
||||
|
||||
|
||||
.section .text
|
||||
// log2l entry point.
// In:  f8 = x.   Out: f8 = log2l(x).
// Control flow of the body that follows:
//   - x == 1 returns +0 early.
//   - Inputs that are not positive normal/unnormal branch to SPECIAL_log2l.
//   - Denormal inputs are normalized into f7 (f8*1+0); the significand and
//     exponent are then re-read from f7.
//   - A normalized value equal to zero (pseudo-zero) branches to
//     LOG2_PSEUDO_ZERO.
//   - When the significand is exactly 1 (predicate p6 clear), the result is
//     just the converted exponent.
// Data layout dependency: the body walks poly_coeffs with post-incremented
// loads, then addresses T_table at poly_coeffs+96 and T_low at T_table+1024.
// NOTE(review): confirm the three RODATA objects are always emitted
// contiguously in that order.
GLOBAL_IEEE754_ENTRY(log2l)
|
||||
|
||||
{ .mfi
|
||||
alloc r32=ar.pfs,1,4,4,0
|
||||
// normalize x
|
||||
// y=frcpa(x)
|
||||
frcpa.s1 f41,p0=f1,f8
|
||||
// r26=bias-1
|
||||
mov r26=0xfffe
|
||||
}
|
||||
{.mfi
|
||||
// r23=bias+16
|
||||
mov r23=0xffff+16
|
||||
fma.s1 f7=f8,f1,f0
|
||||
// r2 = address of the linkage-table entry for poly_coeffs (C_1...C_8, followed by T_table)
|
||||
addl r2 = @ltoff(poly_coeffs), gp;;
|
||||
}
|
||||
{.mfi
|
||||
// get significand
|
||||
getf.sig r25=f8
|
||||
// f8 denormal ?
|
||||
fclass.m p8,p10=f8,0x9
|
||||
// r24=bias-8
|
||||
mov r24=0xffff-8;;
|
||||
}
|
||||
{.mfi
|
||||
setf.exp f36=r26
|
||||
nop.f 0
|
||||
// r27=bias
|
||||
mov r27=0xffff;;
|
||||
}
|
||||
|
||||
{.mmf
|
||||
getf.exp r29=f8
|
||||
// load start address for C_1...C_8 followed by T_table
|
||||
ld8 r2=[r2]
|
||||
// will continue only for positive normal/unnormal numbers
|
||||
fclass.m.unc p0,p12 = f8, 0x19;;
|
||||
}
|
||||
|
||||
|
||||
.pred.rel "mutex",p8,p10
|
||||
{.mfi
|
||||
// denormal input, repeat get significand (after normalization)
|
||||
(p8) getf.sig r25=f7
|
||||
// x=1 ?
|
||||
fcmp.eq.s0 p6,p0=f8,f1
|
||||
// get T_index
|
||||
(p10) shr.u r28=r25,63-8
|
||||
}
|
||||
{.mfi
|
||||
// f32=2^16
|
||||
setf.exp f32=r23
|
||||
nop.f 0
|
||||
mov r26=0x804;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// denormal input, repeat get exponent (after normalization)
|
||||
(p8) getf.exp r29=f7
|
||||
// f33=0
|
||||
mov f33=f0
|
||||
// r26=0x80400...0 (threshold for using polynomial approximation)
|
||||
shl r26=r26,64-12;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
add r3=16,r2
|
||||
// r=x*y-1
|
||||
fms.s1 f6=f41,f8,f1
|
||||
(p12) br.cond.spnt SPECIAL_log2l
|
||||
}
|
||||
{.mfi
|
||||
// load C_1
|
||||
ldfe f14=[r2],48
|
||||
// RN(x*y)
|
||||
fma.s1 f43=f41,f8,f0
|
||||
mov r23=0xff;;
|
||||
}
|
||||
|
||||
{.mmi
|
||||
// load C_7, C_8
|
||||
ldfpd f10,f11=[r3],16
|
||||
// load C_3,C_4
|
||||
ldfpd f15,f42=[r2],16
|
||||
(p8) shr.u r28=r25,63-8;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
// load C_5, C_6
|
||||
ldfpd f12,f13=[r3]
|
||||
// pseudo-zero ?
|
||||
fcmp.eq.s0 p7,p0=f7,f0
|
||||
// if first 9 bits after leading 1 are all zero, then p8=1
|
||||
cmp.ltu p8,p12=r25,r26
|
||||
}
|
||||
{.mfi
|
||||
// load C1l
|
||||
ldfe f34=[r2],16
|
||||
fmerge.se f7=f1,f7
|
||||
// get T_index
|
||||
and r28=r28,r23;;
|
||||
}
|
||||
{.mfi
|
||||
// r29=exponent-bias
|
||||
sub r29=r29,r27
|
||||
// if first 9 bits after leading 1 are 0, use polynomial approx. only
|
||||
(p8) fms.s1 f6=f7,f1,f1
|
||||
// start address of T_low
|
||||
add r3=1024+16,r2
|
||||
}
|
||||
{.mfi
|
||||
// load C_2
|
||||
ldfe f35=[r2],16
|
||||
// x=1, return 0
|
||||
(p6) fma.s0 f8=f0,f0,f0
|
||||
// first 8 bits after leading 1 are all ones ?
|
||||
cmp.eq p10,p0=r23,r28;;
|
||||
}
|
||||
|
||||
{.mfb
|
||||
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
|
||||
// add 1 to the exponent additive term, and estimate log2(1-r)
|
||||
(p10) add r29=1,r29
|
||||
nop.f 0
|
||||
(p7) br.cond.spnt LOG2_PSEUDO_ZERO
|
||||
}
|
||||
{.mfi
|
||||
// get T_low address
|
||||
shladd r3=r28,3,r3
|
||||
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
|
||||
(p10) fms.s1 f6=f7,f36,f1
|
||||
// p10 --> p8=1, p12=0
|
||||
(p10) cmp.eq p8,p12=r0,r0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
// get T_high address
|
||||
shladd r2=r28,2,r2
|
||||
// L(x*y)=x*y-RN(x*y)
|
||||
fms.s1 f41=f41,f8,f43
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// p13=p12
|
||||
(p12) cmp.eq.unc p13,p0=r0,r0
|
||||
// RtH=RN(x*y)-1 (will eliminate rounding errors in r)
|
||||
fms.s1 f43=f43,f1,f1
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p12
|
||||
{.mfb
|
||||
// load T_high (unless first 9 bits after leading 1 are 0)
|
||||
(p12) ldfs f7=[r2]
|
||||
// set T_high=0 (if first 9 bits after leading 1 are 0)
|
||||
(p8) fma.s1 f7=f0,f0,f0
|
||||
// x=1, return
|
||||
(p6) br.ret.spnt b0
|
||||
}
|
||||
.pred.rel "mutex",p8,p12
|
||||
{.mfi
|
||||
// p12: load T_low
|
||||
(p12) ldfd f36=[r3]
|
||||
// p8: set T_low=0
|
||||
(p8) fma.s1 f36=f0,f0,f0
|
||||
(p8) cmp.eq p8,p12=r29,r0;; //nop.i 0;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p12
|
||||
{.mfi
|
||||
// f8=expon - bias
|
||||
setf.sig f8=r29
|
||||
// general case: 2^{16}+C1*r
|
||||
(p12) fma.s1 f33=f6,f14,f32
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// r26=1
|
||||
mov r26=1
|
||||
// p8 (input is close to 1): C1*r+0 (f33 still holds 0 here)
|
||||
(p8) fma.s1 f32=f6,f14,f33
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P78=C_7+C_8*r
|
||||
fma.s1 f10=f11,f6,f10
|
||||
// r26=2^{63}
|
||||
shl r26=r26,63
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P34=C_3+r*C_4
|
||||
fma.s1 f15=f42,f6,f15
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r2=r*r
|
||||
fma.s1 f11=f6,f6,f0
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P56=C_5+C_6*r
|
||||
fma.s1 f13=f13,f6,f12
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// Rth-r
|
||||
(p13) fms.s1 f43=f43,f1,f6
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// significand(x)=1 ?
|
||||
cmp.eq p0,p6=r25,r26
|
||||
// P12=C1l+C_2*r
|
||||
fma.s1 f34=f35,f6,f34
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
.pred.rel "mutex",p8,p12
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// p12: C1r=(2^{16}+C1*r)-2^{16}
|
||||
(p12) fms.s1 f32=f33,f1,f32
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// p8: C1r=C1*r (double extended)
|
||||
(p8) fms.s1 f32=f32,f1,f33
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// L(x*y)*C_1+T_low
|
||||
(p13) fma.s1 f36=f41,f14,f36
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P58=P56+r2*P78
|
||||
fma.s1 f13=f11,f10,f13
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P14=P12+r2*P34
|
||||
fma.s1 f15=f15,f11,f34
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// r4=r2*r2
|
||||
fma.s1 f11=f11,f11,f0
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// normalize additive term (l=exponent of x)
|
||||
fcvt.xf f8=f8
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// D=C1*r-C1r
|
||||
(p6) fms.s1 f12=f14,f6,f32
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// T_low'=(Rth-r)*C1+(L(x*y)*C1+T_low)
|
||||
(p13) fma.s1 f36=f43,f14,f36
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// P18=P14+r4*P58
|
||||
(p6) fma.s1 f13=f11,f13,f15
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// add T_high+l
|
||||
(p6) fma.s1 f8=f8,f1,f7
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// D+T_low
|
||||
(p6) fma.s1 f12=f12,f1,f36
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// (T_high+l)+C1r
|
||||
(p6) fma.s1 f8=f8,f1,f32
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// (D+T_low)+r*P18
|
||||
(p6) fma.s1 f13=f13,f6,f12
|
||||
nop.i 0;;
|
||||
}
|
||||
|
||||
//{.mfb
|
||||
//nop.m 0
|
||||
//mov f8=f36
|
||||
//fma.s0 f8=f13,f6,f0
|
||||
//br.ret.sptk b0;;
|
||||
//}
|
||||
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// result=((T_high+l)+C1r)+((D+T_low)+r*P18)
|
||||
(p6) fma.s0 f8=f13,f1,f8
|
||||
// return
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
|
||||
// SPECIAL_log2l: reached from log2l when the input is not a positive
// normal/unnormal number. Dispatches, in order:
//   +Infinity        -> return x unchanged
//   +/-0             -> TAG 168, f8 = 1/(-0), then __libm_error_region
//   negative non-NaN -> TAG 169, f8 = 0/0,    then __libm_error_region
//   otherwise (NaN)  -> return x*1+0
// NOTE(review): 168/169 are presumably the log2l zero / negative-argument
// codes understood by __libm_error_support - confirm against its tag list.
SPECIAL_log2l:
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// keep the original argument for the error handler; f8 is overwritten below
mov FR_X=f8
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=+Infinity ?
|
||||
fclass.m p7,p0=f8,0x21
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=+/-Zero ?
// (tested on f7, the normalized copy of x produced at entry)
|
||||
fclass.m p8,p0=f7,0x7
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// x=-Infinity, -normal, -denormal ?
|
||||
fclass.m p6,p0=f8,0x3a
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// log2l(+Infinity)=+Infinity
// (f8 still holds x, so it is returned as is)
|
||||
nop.f 0
|
||||
(p7) br.ret.spnt b0;;
|
||||
}
|
||||
{.mfi
|
||||
(p8) mov GR_Parameter_TAG = 168
|
||||
// log2l(+/-0)=-infinity, raises Divide by Zero
|
||||
// set f8=-0
// (negated sign of f0 combined with the exponent/significand of f8)
|
||||
(p8) fmerge.ns f8=f0,f8
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// 1/(-0): produces the -infinity result and the Divide by Zero flag
(p8) frcpa.s0 f8,p0=f1,f8
|
||||
(p8) br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
{.mfb
|
||||
(p6) mov GR_Parameter_TAG = 169
|
||||
// x<0: return NaN, raise Invalid
// (0/0 via frcpa)
|
||||
(p6) frcpa.s0 f8,p0=f0,f0
|
||||
(p6) br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
|
||||
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// Remaining cases: NaNs
// (x*1+0 in status field s0 returns the NaN through an arithmetic operation)
|
||||
fma.s0 f8=f8,f1,f0
|
||||
br.ret.sptk b0;;
|
||||
}
|
||||
|
||||
// LOG2_PSEUDO_ZERO: reached from log2l when the normalized input compares
// equal to zero even though it passed the positive normal/unnormal class
// test. It is handled like log2l(0): TAG 168, f8 = 1/(negated f8), then the
// common error path in __libm_error_region.
LOG2_PSEUDO_ZERO:
|
||||
|
||||
{.mfi
|
||||
nop.m 0
|
||||
// keep the original argument for the error handler; f8 is overwritten below
mov FR_X=f8
|
||||
nop.i 0
|
||||
}
|
||||
{.mfi
|
||||
// same tag as the +/-0 case in SPECIAL_log2l
mov GR_Parameter_TAG = 168
|
||||
// log2l(+/-0)=-infinity, raises Divide by Zero
|
||||
// set f8=-0
// (negated sign of f0 combined with the exponent/significand of f8)
|
||||
fmerge.ns f8=f0,f8
|
||||
nop.i 0;;
|
||||
}
|
||||
{.mfb
|
||||
nop.m 0
|
||||
// reciprocal of the negated zero: -infinity result, Divide by Zero raised
frcpa.s0 f8,p0=f1,f8
|
||||
br.cond.sptk __libm_error_region;;
|
||||
}
|
||||
|
||||
|
||||
GLOBAL_IEEE754_END(log2l)
|
||||
libm_alias_ldouble_other (__log2, log2)
|
||||
|
||||
|
||||
// __libm_error_region: common tail for the log2l error paths.
// Expects on entry: FR_X = original argument, FR_RESULT (f8) = default
// result, GR_Parameter_TAG = error tag.
// Creates a 64-byte frame and stores the operands at (offsets from new sp):
//   sp+16  FR_X       <- GR_Parameter_X
//   sp+32  FR_Y       <- GR_Parameter_Y
//   sp+48  FR_RESULT  <- GR_Parameter_RESULT
// then calls __libm_error_support with those three addresses and the tag in
// the output registers, reloads f8 from sp+48 (so the handler may replace
// the result), restores sp/b0/gp/ar.pfs and returns to log2l's caller.
LOCAL_LIBM_ENTRY(__libm_error_region)
|
||||
.prologue
|
||||
{ .mfi
|
||||
// computed from the old sp: old sp-32 == new sp+32
add GR_Parameter_Y=-32,sp // Parameter 2 value
|
||||
nop.f 0
|
||||
.save ar.pfs,GR_SAVE_PFS
|
||||
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
|
||||
}
|
||||
{ .mfi
|
||||
.fframe 64
|
||||
add sp=-64,sp // Create new stack
|
||||
nop.f 0
|
||||
mov GR_SAVE_GP=gp // Save gp
|
||||
};;
|
||||
{ .mmi
|
||||
// the post-increment leaves GR_Parameter_Y at sp+48, the result slot
stfe [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
|
||||
add GR_Parameter_X = 16,sp // Parameter 1 address
|
||||
.save b0, GR_SAVE_B0
|
||||
mov GR_SAVE_B0=b0 // Save b0
|
||||
};;
|
||||
.body
|
||||
{ .mib
|
||||
stfe [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
|
||||
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
|
||||
nop.b 0
|
||||
}
|
||||
{ .mib
|
||||
stfe [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
|
||||
add GR_Parameter_Y = -16,GR_Parameter_Y // Parameter 2 address (back to sp+32)
|
||||
br.call.sptk b0=__libm_error_support# // Call error handling function
|
||||
};;
|
||||
{ .mmi
|
||||
nop.m 0
|
||||
nop.m 0
|
||||
// recompute the result slot address after the call
add GR_Parameter_RESULT = 48,sp
|
||||
};;
|
||||
{ .mmi
|
||||
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
|
||||
.restore sp
|
||||
add sp = 64,sp // Restore stack pointer
|
||||
mov b0 = GR_SAVE_B0 // Restore return address
|
||||
};;
|
||||
{ .mib
|
||||
mov gp = GR_SAVE_GP // Restore gp
|
||||
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
|
||||
br.ret.sptk b0 // Return
|
||||
};;
|
||||
|
||||
LOCAL_LIBM_END(__libm_error_region)
|
||||
.type __libm_error_support#,@function
|
||||
.global __libm_error_support#
|
@ -1 +0,0 @@
|
||||
/* Not needed. */
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user