mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-01 07:20:15 +00:00
30891f35fa
We stopped adding "Contributed by" or similar lines in sources in 2012 in favour of git logs and keeping the Contributors section of the glibc manual up to date. Removing these lines makes the license header a bit more consistent across files and also removes the possibility of error in attribution when license blocks or files are copied across since the contributed-by lines don't actually reflect reality in those cases. Move all "Contributed by" and similar lines (Written by, Test by, etc.) into a new file CONTRIBUTED-BY to retain record of these contributions. These contributors are also mentioned in manual/contrib.texi, so we just maintain this additional record as a courtesy to the earlier developers. The following scripts were used to filter a list of files to edit in place and to clean up the CONTRIBUTED-BY file respectively. These were not added to the glibc sources because they're not expected to be of any use in future given that this is a one time task: https://gist.github.com/siddhesh/b5ecac94eabfd72ed2916d6d8157e7dc https://gist.github.com/siddhesh/15ea1f5e435ace9774f485030695ee02 Reviewed-by: Carlos O'Donell <carlos@redhat.com>
1345 lines
55 KiB
ArmAsm
1345 lines
55 KiB
ArmAsm
.file "tanhl.s"
|
|
|
|
|
|
// Copyright (c) 2001 - 2003, Intel Corporation
|
|
// All rights reserved.
|
|
//
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
//
|
|
// * Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
//
|
|
// * The name of Intel Corporation may not be used to endorse or promote
|
|
// products derived from this software without specific prior written
|
|
// permission.
|
|
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Intel Corporation is the author of this code, and requests that all
|
|
// problem reports or change requests be submitted to it directly at
|
|
// http://www.intel.com/software/products/opensource/libraries/num.htm.
|
|
//
|
|
// History
|
|
//==============================================================
|
|
// 11/29/01 Initial version
|
|
// 05/20/02 Cleaned up namespace and sf0 syntax
|
|
// 08/14/02 Changed mli templates to mlx
|
|
// 02/10/03 Reordered header: .section, .global, .proc, .align
|
|
//
|
|
// API
|
|
//==============================================================
|
|
// long double tanhl(long double)
|
|
//
|
|
// Overview of operation
|
|
//==============================================================
|
|
//
|
|
// Algorithm description
|
|
// ---------------------
|
|
//
|
|
// There are 4 paths:
|
|
//
|
|
// 1. Special path: x = 0, Inf, NaNs, denormal
|
|
// Return tanhl(x) = +/-0.0 for zeros
|
|
// Return tanhl(x) = QNaN for NaNs
|
|
// Return tanhl(x) = sign(x)*1.0 for Inf
|
|
// Return tanhl(x) = x + x^2 for - denormals
|
|
// Return tanhl(x) = x - x^2 for + denormals
|
|
//
|
|
// 2. [0;1/8] path: 0.0 < |x| < 1/8
|
|
// Return tanhl(x) = x + x^3*A3 + ... + x^15*A15
|
|
//
|
|
// 3. Main path: 1/8 <= |x| < 22.8
|
|
// For several ranges of 1/8 <= |x| < 22.8
|
|
// Return tanhl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
|
|
// + y^3*A3 + y^4*A4 + ... + y^25*A25 )
|
|
// where y = (|x|/a) - b
|
|
//
|
|
// For each range there is particular set of coefficients.
|
|
// Below is the list of ranges:
|
|
// 1/8 <= |x| < 1/4 a = 0.125, b = 1.5
|
|
// 1/4 <= |x| < 1/2 a = 0.25, b = 1.5
|
|
// 1/2 <= |x| < 1.0 a = 0.5, b = 1.5
|
|
// 1.0 <= |x| < 2.0 a = 1.0, b = 1.5
|
|
// 2.0 <= |x| < 3.25 a = 2.0, b = 1.5
|
|
// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0
|
|
// 4.0 <= |x| < 6.5 a = 4.0, b = 1.5
|
|
// 6.5 <= |x| < 8.0 a = 4.0, b = 2.0
|
|
// 8.0 <= |x| < 13.0 a = 8.0, b = 1.5
|
|
// 13.0 <= |x| < 16.0 a = 8.0, b = 2.0
|
|
// 16.0 <= |x| < 22.8 a = 16.0, b = 1.5
|
|
// ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges separated
|
|
// to resolve monotonicity issues )
|
|
//
|
|
// 4. Saturation path: 22.8 <= |x| < +INF
|
|
// Return tanhl(x) = sign(x)*(1.0 - tiny_value)
|
|
// (tiny_value ~ 1e-1233)
|
|
//
|
|
// Implementation notes
|
|
// --------------------
|
|
//
|
|
// 1. Special path: x = 0, INF, NaNs, denormals
|
|
//
|
|
// This branch is cut off by one fclass operation.
|
|
// Then zeros+nans, infinities and denormals are processed separately.
|
|
// For denormals we use simple fma operation x+x*x (- for +denorms)
|
|
//
|
|
// 2. [0;1/8] path: 0.0 < |x| < 1/8
|
|
//
|
|
// Here we use simple polynomial computations, where last step
|
|
// is performed as x + x^3*A3+...
|
|
// The rest of polynomial is factorized using binary tree technique.
|
|
//
|
|
// 3. Main path: 1/8 <= |x| < 22.8
|
|
//
|
|
// Multiprecision has to be performed only for first few
|
|
// polynomial iterations (up to 3-rd x degree)
|
|
// Here we use the same parallelisation way as above:
|
|
// Split whole polynomial to first, "multiprecision" part, and second,
|
|
// so called "tail", native precision part.
|
|
//
|
|
// 1) Multiprecision part:
|
|
// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
|
|
// v1 and v2 terms calculated in parallel
|
|
//
|
|
// 2) Tail part:
|
|
// v3 = y^4 * ( A4 + y*A5 + ... + y^21*A25 )
|
|
// v3 is split into 2 even parts (10 coefficients in each one).
|
|
// These 2 parts are also factorized using binary tree technique.
|
|
//
|
|
// So Multiprecision and Tail parts cost is almost the same
|
|
// and we have both results ready before final summation.
|
|
//
|
|
// Some tricks were applied to maintain symmetry in directed
|
|
// rounding modes (to +/-inf). We had to set result sign
|
|
// not at the last operation but much earlier and at
|
|
// several places.
|
|
//
|
|
// 4. Saturation path: 22.8 <= |x| < +INF
|
|
//
|
|
// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
|
|
// just to meet IEEE requirements for different rounding modes in this case.
|
|
//
|
|
// Registers used
|
|
//==============================================================
|
|
// Floating Point registers used:
|
|
// f8 - input & output
|
|
// f32 -> f92
|
|
|
|
// General registers used:
|
|
// r2, r3, r32 -> r52
|
|
|
|
// Predicate registers used:
|
|
// p0, p6 -> p11, p14, p15
|
|
|
|
// p6 - arg is zero, denormal or special IEEE
|
|
// p7 - arg is in [16;32] binary interval
|
|
// p8 - arg is in one of subranges
|
|
// [3.25;4.0], [6.5;8.0], [13.0;16.0]
|
|
// p9 - arg < 1/8
|
|
// p10 - arg is NOT in one of subranges
|
|
// [3.25;4.0], [6.5;8.0], [13.0;16.0]
|
|
// p11 - arg in saturation domain
|
|
// p14 - arg is positive
|
|
// p15 - arg is negative
|
|
|
|
// Assembly macros
|
|
//==============================================================
|
|
// General-register aliases.
// rDataPtr/rTailDataPtr are bound to r2/r3; every other alias below is
// bound to consecutive registers r33 through r52.
rDataPtr        = r2
rTailDataPtr    = r3

rBias           = r33
rSignBit        = r34
rInterval       = r35

rArgExp         = r36
rArgSig         = r37
r3p25Offset     = r38
r2to4           = r39
r1p25           = r40
rOffset         = r41
r1p5            = r42
rSaturation     = r43
r1625Sign       = r44
rTiny           = r45
rAddr1          = r46
rAddr2          = r47
rTailAddr1      = r48
rTailAddr2      = r49
rTailOffset     = r50
rTailAddOffset  = r51
rShiftedDataPtr = r52
|
|
|
|
//==============================================================
|
|
// Floating-point register aliases, bound consecutively to f32..f92.

// Polynomial coefficient registers: A0..A2 as high/low pairs,
// A3..A25 as single registers (same names as in the data tables).
fA0H          = f32
fA0L          = f33
fA1H          = f34
fA1L          = f35
fA2H          = f36
fA2L          = f37
fA3           = f38
fA4           = f39
fA5           = f40
fA6           = f41
fA7           = f42
fA8           = f43
fA9           = f44
fA10          = f45
fA11          = f46
fA12          = f47
fA13          = f48
fA14          = f49
fA15          = f50
fA16          = f51
fA17          = f52
fA18          = f53
fA19          = f54
fA20          = f55
fA21          = f56
fA22          = f57
fA23          = f58
fA24          = f59
fA25          = f60

// Argument-power aliases (Sqr/Cube/Four/Eight by name).
fArgSqr       = f61
fArgCube      = f62
fArgFour      = f63
fArgEight     = f64

// Normalized |argument| aliases.
fArgAbsNorm   = f65
fArgAbsNorm2  = f66
fArgAbsNorm2L = f67
fArgAbsNorm3  = f68
fArgAbsNorm4  = f69
fArgAbsNorm11 = f70

// Result and partial-result aliases.
fRes          = f71
fResH         = f72
fResL         = f73
fRes1H        = f74
fRes1L        = f75
fRes1Hd       = f76
fRes2H        = f77
fRes2L        = f78
fRes3H        = f79
fRes3L        = f80
fRes4         = f81

// Temporaries.
fTT           = f82
fTH           = f83
fTL           = f84
fTT2          = f85
fTH2          = f86
fTL2          = f87

// Constants and miscellaneous.
f1p5          = f88
f2p0          = f89
fTiny         = f90
fSignumX      = f91
fArgAbsNorm4X = f92
|
|
|
|
// Data tables
|
|
//==============================================================
|
|
RODATA

.align 16
LOCAL_OBJECT_START(tanhl_data)

////////// Main tables ///////////
_0p125_to_0p25_data: // exp = 2^-3
// tanh(x) polynomial coefficients for 1/8 <= |x| < 1/4.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0x93D27D6AE7E835F8, 0x0000BFF4  // A3  = -5.6389704216278164626050408239e-04
data8 0xBF66E8668A78A8BC              // A2H = -2.7963640930198357253955165902e-03
data8 0xBBD5384EFD0E7A54              // A2L = -1.7974001252014762983581666453e-20
data8 0x3FBEE69E31DB6156              // A1H = 1.2070645062647619716322822114e-01
data8 0x3C43A0B4E24A3DCA              // A1L = 2.1280460108882061756490131241e-18
data8 0x3FC7B8FF903BF776              // A0H = 1.8533319990813951205765874874e-01
data8 0x3C593F1A61986FD4              // A0L = 5.4744612262799573374268254539e-18
data8 0xDB9E6735560AAE5A, 0x0000BFA3  // A25 = -3.4649731131719154051239475238e-28
data8 0xF0DDE953E4327704, 0x00003FA4  // A24 = 7.6004173864565644629900702857e-28
data8 0x8532AED11DEC5612, 0x00003FAB  // A23 = 5.3798235684551098715428515761e-26
data8 0xAEF72A34D88B0038, 0x0000BFAD  // A22 = -2.8267199091484508912273222600e-25
data8 0x9645EF1DCB759DDD, 0x0000BFB2  // A21 = -7.7689413112830095709522203109e-24
data8 0xA5D12364E121F70F, 0x00003FB5  // A20 = 6.8580281614531622113161030550e-23
data8 0x9CF166EA815AC705, 0x00003FB9  // A19 = 1.0385615003184753213024737634e-21
data8 0x852B1D0252498752, 0x0000BFBD  // A18 = -1.4099753997949827217635356478e-20
data8 0x9270F5716D25EC9F, 0x0000BFC0  // A17 = -1.2404055949090177751123473821e-19
data8 0xC216A9C4EEBDDDCA, 0x00003FC4  // A16 = 2.6303900460415782677749729120e-18
data8 0xDCE944D89FF592F2, 0x00003FC6  // A15 = 1.1975620514752377092265425941e-17
data8 0x83C8DDF213711381, 0x0000BFCC  // A14 = -4.5721980583985311263109531319e-16
LOCAL_OBJECT_END(tanhl_data)
|
|
|
|
LOCAL_OBJECT_START(_0p25_to_0p5_data)
// tanh(x) polynomial coefficients for 1/4 <= |x| < 1/2.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xB6E27B747C47C8AD, 0x0000BFF6  // A3  = -2.7905990032063258105302045572e-03
data8 0xBF93FD54E226F8F7              // A2H = -1.9521070769536099515084615064e-02
data8 0xBC491BC884F6F18A              // A2L = -2.7222721075104525371410300625e-18
data8 0x3FCBE3FBB015A591              // A1H = 2.1789499376181400980279079249e-01
data8 0x3C76AFC2D1AE35F7              // A1L = 1.9677459707672596091076696742e-17
data8 0x3FD6EF53DE8C8FAF              // A0H = 3.5835739835078589399230963863e-01
data8 0x3C8E2A1C14355F9D              // A0L = 5.2327050592919416045278607775e-17
data8 0xF56D363AAE3BAD53, 0x00003FBB  // A25 = 6.4963882412697389947564301120e-21
data8 0xAD6348526CEEB897, 0x0000BFBD  // A24 = -1.8358149767147407353343152624e-20
data8 0x85D96A988565FD65, 0x0000BFC1  // A23 = -2.2674950494950919052759556703e-19
data8 0xD52CAF6B1E4D9717, 0x00003FC3  // A22 = 1.4445269502644677106995571101e-18
data8 0xBD7E1BE5CBEF7A01, 0x00003FC5  // A21 = 5.1362075721080004718090799595e-18
data8 0xAE84A9B12ADD6948, 0x0000BFC9  // A20 = -7.5685210830925426342786733068e-17
data8 0xEAC2D5FCF80E250C, 0x00003FC6  // A19 = 1.2726423522879522181100392135e-17
data8 0xE0D2A8AC8C2EDB95, 0x00003FCE  // A18 = 3.1200443098733419749016380203e-15
data8 0xB22F0AB7B417F78E, 0x0000BFD0  // A17 = -9.8911854977385933809488291835e-15
data8 0xE25A627BAEFFA7A4, 0x0000BFD3  // A16 = -1.0052095388666003876301743498e-13
data8 0xC90F32EC4A17F908, 0x00003FD6  // A15 = 7.1430637679768183097897337145e-13
data8 0x905F6F124AF956B1, 0x00003FD8  // A14 = 2.0516607231389483452611375485e-12
LOCAL_OBJECT_END(_0p25_to_0p5_data)
|
|
|
|
LOCAL_OBJECT_START(_0p5_to_1_data)
// tanh(x) polynomial coefficients for 1/2 <= |x| < 1.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xAB402BE491EE72A7, 0x00003FF7  // A3  = 5.2261556931080934657023772945e-03
data8 0xBFB8403D3DDA87BE              // A2H = -9.4730212784752659826992271519e-02
data8 0xBC6FF7BC2AB71A8B              // A2L = -1.3863786398568460929625760740e-17
data8 0x3FD3173B1EFA6EF4              // A1H = 2.9829290414066567116435635398e-01
data8 0x3C881E4DCABDE840              // A1L = 4.1838710466827119847963316219e-17
data8 0x3FE45323E552F228              // A0H = 6.3514895238728730220145735075e-01
data8 0x3C739D5832BF7BCF              // A0L = 1.7012977006567066423682445459e-17
data8 0xF153980BECD8AE12, 0x00003FD0  // A25 = 1.3396313991261493342597057700e-14
data8 0xEC9ACCD245368129, 0x0000BFD3  // A24 = -1.0507358886349528807350792383e-13
data8 0x8AE6498CA36D2D1A, 0x00003FD4  // A23 = 1.2336759149738309660361813001e-13
data8 0x8DF02FBF5AC70E64, 0x00003FD7  // A22 = 1.0085317723615282268326194551e-12
data8 0x9E15C7125DA204EE, 0x0000BFD9  // A21 = -4.4930478919612724261941857560e-12
data8 0xA62C6F39BDDCEC1C, 0x00003FD7  // A20 = 1.1807342457875095150035780314e-12
data8 0xDFD8D65D30F80F52, 0x00003FDC  // A19 = 5.0896919887121116317817665996e-11
data8 0xB795AFFD458F743E, 0x0000BFDE  // A18 = -1.6696932710534097241291327756e-10
data8 0xFEF30234CB01EC89, 0x0000BFDD  // A17 = -1.1593749714588103589483091370e-10
data8 0xA2F638356E13761E, 0x00003FE2  // A16 = 2.3714062288761887457674853605e-09
data8 0xC429CC0D031E4FD5, 0x0000BFE3  // A15 = -5.7091025466377379046489586383e-09
data8 0xC78363FF929EFF62, 0x0000BFE4  // A14 = -1.1613199289622686725595739572e-08
LOCAL_OBJECT_END(_0p5_to_1_data)
|
|
|
|
LOCAL_OBJECT_START(_1_to_2_data)
// tanh(x) polynomial coefficients for 1 <= |x| < 2.0.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xB3D8FB48A548D99A, 0x00003FFB  // A3  = 8.7816203264683800892441646129e-02
data8 0xBFC4EFBD8FB38E3B              // A2H = -1.6356629864377389416141284073e-01
data8 0xBC77687FD8087B23              // A2L = -2.0303377679446772162287121190e-17
data8 0x3FC72165282C6F72              // A1H = 1.8070663892364852154415189034e-01
data8 0x3C64E01F7A76D777              // A1L = 9.0532964466719018524360408402e-18
data8 0x3FECF6F9786DF577              // A0H = 9.0514825364486639625027919465e-01
data8 0x3C8834EDCE71A65B              // A0L = 4.1992023813070331863928976191e-17
data8 0xC3EEEB3EFA688094, 0x00003FE2  // A25 = 2.8512044383274095705865793485e-09
data8 0x88461973672AEB12, 0x0000BFE1  // A24 = -9.9152258079470849685057375343e-10
data8 0xFC2AF9950DC5027E, 0x0000BFE4  // A23 = -1.4678101918123116001692289670e-08
data8 0x9C80CA742F89B7B5, 0x00003FE6  // A22 = 3.6438714992394138274843759814e-08
data8 0xA0B3D7FAA606260A, 0x0000BFE6  // A21 = -3.7416469848124568887944709492e-08
data8 0xDA5858432FBD9D9D, 0x0000BFE6  // A20 = -5.0837429421503142141842414978e-08
data8 0xB0244D1E1AE9C1B0, 0x00003FE9  // A19 = 3.2808967255272595749004827841e-07
data8 0xC8D3109ACF740738, 0x0000BFEA  // A18 = -7.4812945767507614821609020680e-07
data8 0xBB0F3440EEA55BBF, 0x00003FEA  // A17 = 6.9685053481643125932497676583e-07
data8 0xC13A8B08D8576C19, 0x00003FEB  // A16 = 1.4396658837712390333960587173e-06
data8 0xFF3A1163CC5522A1, 0x0000BFED  // A15 = -7.6063522055104010298762276148e-06
data8 0x8672AF27EB0823B7, 0x00003FEF  // A14 = 1.6027448793338500004496520337e-05
LOCAL_OBJECT_END(_1_to_2_data)
|
|
|
|
LOCAL_OBJECT_START(_2_to_3p25_data)
// tanh(x) polynomial coefficients for 2 <= |x| < 3.25.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xD45657BEC559E366, 0x00003FFA  // A3  = 5.1840155367548909799883161889e-02
data8 0xBFA41B109CA6AB81              // A2H = -3.9268988726084870510835145296e-02
data8 0xBC2C3D708A4E56C5              // A2L = -7.6544669252238280132415018518e-19
data8 0x3F9434A517BBC5F4              // A1H = 1.9732074330880380874653212686e-02
data8 0x3C3ED62DD9585229              // A1L = 1.6716574468135097509707871438e-18
data8 0x3FEFD77D111A0AFF              // A0H = 9.9505475368673035330147058630e-01
data8 0x3C9C415E151C6CA5              // A0L = 9.8030409604070051319822874013e-17
data8 0xB1596391D4534D52, 0x00003FEC  // A25 = 2.6427086526487251988631279067e-06
data8 0xC4DC44E243D1AF5F, 0x00003FEF  // A24 = 2.3467591534149209236830008333e-05
data8 0xAED5786023982BB8, 0x00003FF0  // A23 = 4.1683642395739762658623742687e-05
data8 0xCF39926C9FBC6A10, 0x00003FF0  // A22 = 4.9406263949321793291856681624e-05
data8 0xA255A72359928142, 0x00003FF0  // A21 = 3.8703580278108400672236161973e-05
data8 0xA2E573B9FC332C0D, 0x00003FED  // A20 = 4.8546879618263642155709302480e-06
data8 0x82C7BD01830ACA93, 0x00003FF0  // A19 = 3.1180436075031301077175550468e-05
data8 0xB38AF4C76E96444B, 0x0000BFF0  // A18 = -4.2806338675404452784440167120e-05
data8 0xEC08FF0FB194464C, 0x00003FF0  // A17 = 5.6275163156181928637744511210e-05
data8 0xB850825D9E235135, 0x0000BFF0  // A16 = -4.3943998628289568813056822585e-05
data8 0xF98436E838763687, 0x0000BFEF  // A15 = -2.9744680263523220185672219686e-05
data8 0xE1851A2D00737A5D, 0x00003FF2  // A14 = 2.1507256570895163202182573369e-04
LOCAL_OBJECT_END(_2_to_3p25_data)
|
|
|
|
LOCAL_OBJECT_START(_4_to_6p5_data)
// tanh(x) polynomial coefficients for 4 <= |x| < 6.5.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0x896FDBD321A0BE58, 0x00003FF5  // A3  = 1.0485606995331904734870550114e-03
data8 0xBF39C522B95A37D6              // A2H = -3.9321992640217512306882730044e-04
data8 0xBBA9B3EC39A45338              // A2L = -2.7213922673282819034134988241e-21
data8 0x3F19C5377A48B5AD              // A1H = 9.8306189621330793766869338146e-05
data8 0x3BCAFCB1D08A891C              // A1L = 1.1429476443042275163117526657e-20
data8 0x3FEFFFE63ABE253B              // A0H = 9.9998771165079547440512897083e-01
data8 0x3C9BB74C4EE0D16F              // A0L = 9.6159219890436197391279544561e-17
data8 0x8D86121D469AFA7E, 0x0000BFEF  // A25 = -1.6870941388985743600323604423e-05
data8 0x9D3656A36593C5C4, 0x00003FEF  // A24 = 1.8741161763079973068909254398e-05
data8 0xDCD772D5BF9ADB96, 0x00003FF0  // A23 = 5.2652739523018349983563695656e-05
data8 0xFF79ADCF0DCBCC2D, 0x00003FF1  // A22 = 1.2182012003034659966028035977e-04
data8 0x84D24E394DEFD0D2, 0x00003FF1  // A21 = 6.3334229517535065590380468696e-05
data8 0xA66B56BFD2782544, 0x00003FF1  // A20 = 7.9354902476954571736114945842e-05
data8 0xFB15771FBF3155FE, 0x0000BFEE  // A19 = -1.4965763624796745134798717707e-05
data8 0xC774790126BE54C3, 0x00003FEF  // A18 = 2.3776885435831770523136610539e-05
data8 0x825A13DACB8C68CD, 0x00003FEF  // A17 = 1.5539153272890695426189818556e-05
data8 0xCFF96E6810AACE27, 0x0000BFF1  // A16 = -9.9169893703251156059893890295e-05
data8 0x8A85D2061B865024, 0x00003FF3  // A15 = 2.6421115104625621420758344535e-04
data8 0x922EC6F3CFE0496E, 0x0000BFF4  // A14 = -5.5764283474946207558456581668e-04
LOCAL_OBJECT_END(_4_to_6p5_data)
|
|
|
|
LOCAL_OBJECT_START(_8_to_13_data)
// tanh(x) polynomial coefficients for 8 <= |x| < 13.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xDD6050A898303460, 0x00003FE6  // A3  = 5.1543170295688189081352133793e-08
data8 0xBE44C1078FDBADC0              // A2H = -9.6643444318955652627581125180e-09
data8 0xBAF95FCAA6DBBA6F              // A2L = -1.3118146684038113473094275420e-24
data8 0x3E14C1078FE26748              // A1H = 1.2080430540780827633746315479e-09
data8 0x3A88168082F37D95              // A1L = 9.7290246966246404028418245094e-27
data8 0x3FEFFFFFFFF59F7C              // A0H = 9.9999999992449728480892190419e-01
data8 0x3C7C068EBC5C2EEB              // A0L = 2.4308346546749583521003998922e-17
data8 0x9DC155C77A6C46E5, 0x00003FF2  // A25 = 1.5044709695520252096006763473e-04
data8 0xF2F9E09CA47F46E9, 0x00003FF3  // A24 = 4.6344010077547944693833282056e-04
data8 0xCBFD67E704734BC8, 0x00003FF4  // A23 = 7.7815958662026429864083620142e-04
data8 0xC18DC821CD67E621, 0x00003FF4  // A22 = 7.3834928521190855055818897104e-04
data8 0x8AF72BCAB05A296E, 0x00003FF4  // A21 = 5.3011135848666430331904214879e-04
data8 0xC2E73BE9B9AB4007, 0x00003FF2  // A20 = 1.8587423129049905806822275188e-04
data8 0xE7E8C2058E2FF9F7, 0x00003FF1  // A19 = 1.1058292891321512917337425414e-04
data8 0xC46309F52E429F97, 0x0000BFF0  // A18 = -4.6822278664829811025251866877e-05
data8 0x81966C1E007E9BEB, 0x00003FF1  // A17 = 6.1792176836716291200611553354e-05
data8 0x8CEDC4BEFCAB9A7E, 0x0000BFF1  // A16 = -6.7200080564674449915571760779e-05
data8 0x8B64E9FA53210018, 0x00003FF1  // A15 = 6.6468331917938095774361868182e-05
data8 0x82DEDAA539A3A3F1, 0x0000BFF1  // A14 = -6.2403928644276709411156885292e-05
LOCAL_OBJECT_END(_8_to_13_data)
|
|
|
|
LOCAL_OBJECT_START(_16_to_22p8_data)
// tanh(x) polynomial coefficients for 16 <= |x| < 22.88.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0x992C00F33DDE804D, 0x00003FCE  // A3  = 2.1256869805798788337547274131e-15
data8 0x3C8D42EA28102760              // A2H = 5.0760412270332007485198379096e-17
data8 0x391A747B43B072DD              // A2L = 1.2737621993898125881520341053e-33
data8 0x3C309BC5C3CB4D5F              // A1H = 9.0034785192019775952205276560e-19
data8 0x38A8EF3B5C9DCE71              // A1L = 9.3793162715476168397242934494e-36
data8 0x3FF0000000000000              // A0H = 1.0000000000000000000000000000e+00
data8 0x3BACC66AFD5CA22A              // A0L = 3.0466790472070565954180861749e-21
data8 0xF020FB351C2F37CB, 0x00003FF1  // A25 = 1.1450235038836625246604146870e-04
data8 0xBE80596C51302A7B, 0x00003FF4  // A24 = 7.2670503421185030764546828414e-04
data8 0x91343CF8577E0131, 0x00003FF6  // A23 = 2.2156380512949603402001207105e-03
data8 0x8D029A8679641286, 0x00003FF7  // A22 = 4.3032888906494613055765544559e-03
data8 0xC3713F64D8DC4BAB, 0x00003FF7  // A21 = 5.9644279041951657632420721490e-03
data8 0xCD678C455A5D06C2, 0x00003FF7  // A20 = 6.2684473911812928601693994403e-03
data8 0xA9E1C825BDCEEBCC, 0x00003FF7  // A19 = 5.1843859941826642445235686826e-03
data8 0xE29C919AD93F6EB9, 0x00003FF6  // A18 = 3.4578185539872939928152204329e-03
data8 0xF7E615A75994A607, 0x00003FF5  // A17 = 1.8913175041916131006881986311e-03
data8 0xE102EFE0F7F2B2AD, 0x00003FF4  // A16 = 8.5835064987089641065525269712e-04
data8 0xAAD62946DEE96996, 0x00003FF3  // A15 = 3.2584489313998677644253007210e-04
data8 0xDA2470DE110B293E, 0x00003FF1  // A14 = 1.0401837693241806604296821650e-04
LOCAL_OBJECT_END(_16_to_22p8_data)
|
|
|
|
LOCAL_OBJECT_START(_3p25_to_4_data)
// tanh(x) polynomial coefficients for 3.25 <= |x| < 4.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xE9E07240432926E6, 0x00003FF7  // A3  = 7.1373517862636557382403555215e-03
data8 0xBF75F495227AF306              // A2H = -5.3602052282115727338540622782e-03
data8 0xBBBE92D355A6B716              // A2L = -6.4741983326810209847018826624e-21
data8 0x3F65F85AD510B690              // A1H = 2.6819013660517934671823070403e-03
data8 0x3C159A0B73E6EC01              // A1L = 2.9275813076637328121849573333e-19
data8 0x3FEFFA81708A0B42              // A0H = 9.9932929973906703402519724477e-01
data8 0x3C66857246C19DC6              // A0L = 9.7670460995685717424398031188e-18
data8 0xE6B6B8365B1E4D6C, 0x00003FE3  // A25 = 6.7146538162212081470554423396e-09
data8 0xE0453CEEF483A510, 0x00003FE2  // A24 = 3.2635647369924061614015292015e-09
data8 0x9C7D83B56E92CF1A, 0x00003FE5  // A23 = 1.8217867585545497089756353348e-08
data8 0xA94635C48ABA9EB4, 0x0000BFE4  // A22 = -9.8530586070049930796756799547e-09
data8 0xB1B0C14443067646, 0x00003FE5  // A21 = 2.0685890807654992387562340307e-08
data8 0x9C6E549781E293C3, 0x00003FDE  // A20 = 1.4227314592865135171341122138e-10
data8 0xB0CBFCE7C80F57A7, 0x0000BFE7  // A19 = -8.2327438416004542109809245219e-08
data8 0xB151AB3876E896E1, 0x00003FE9  // A18 = 3.3028241036175815328309577940e-07
data8 0xFCF3A5C1A5CB7EEE, 0x0000BFEA  // A17 = -9.4231869277542043001280640966e-07
data8 0x96A9016C7C95BEDA, 0x00003FEC  // A16 = 2.2450115975007100522962781833e-06
data8 0x9B9B0A3901DEC05B, 0x0000BFED  // A15 = -4.6374089937147736266514566049e-06
data8 0x8987DF26A6789CCF, 0x00003FEE  // A14 = 8.1974714257536543772040700977e-06
LOCAL_OBJECT_END(_3p25_to_4_data)
|
|
|
|
LOCAL_OBJECT_START(_6p5_to_8_data)
// tanh(x) polynomial coefficients for 6.5 <= |x| < 8.0.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0xA11C8A63815E5657, 0x00003FEF  // A3  = 1.9205985861286093001394561449e-05
data8 0xBEDE355AD6CB61D8              // A2H = -7.2022479400070228499307345427e-06
data8 0xBB8E6B50B8468A63              // A2L = -8.0518953122203408718779840543e-22
data8 0x3EBE355B48DCF330              // A1H = 1.8005623902549165889479948488e-06
data8 0x3B5837550FFA98DA              // A1L = 8.0124491698609178046195694087e-23
data8 0x3FEFFFFF872A91F8              // A0H = 9.9999977492967584424832239165e-01
data8 0x3C8A43B839B4EB63              // A0L = 4.5561696441306660142461355317e-17
data8 0xB5BC1948966B8826, 0x0000BFE6  // A25 = -4.2313421330480692560677276010e-08
data8 0x91D0BE367389BDFC, 0x0000BFE8  // A24 = -1.3580117599617083801153887619e-07
data8 0xFFD950AF282AB36C, 0x0000BFE8  // A23 = -2.3827784451962439125197203287e-07
data8 0x959B1770EBB8903A, 0x0000BFE9  // A22 = -2.7866256690165347051403663794e-07
data8 0xCC78060D1C0CFF3C, 0x0000BFE8  // A21 = -1.9042644867126442102188429523e-07
data8 0xF8919BAF2E87F31D, 0x0000BFE8  // A20 = -2.3149771783868910586746973299e-07
data8 0xC5B6AC942A3F2440, 0x00003FE8  // A19 = 1.8413511183396213757149263639e-07
data8 0xABF1A4703056450A, 0x0000BFEA  // A18 = -6.4054099983863829656292958643e-07
data8 0xBB543D8BDB670453, 0x00003FEB  // A17 = 1.3957102903892251890348444989e-06
data8 0xC9D6F37700C1D092, 0x0000BFEC  // A16 = -3.0076451968978522605262647414e-06
data8 0xCA6EF4BB64E49EC8, 0x00003FED  // A15 = 6.0329860989478473738709576062e-06
data8 0xBE25D0FD069D0A93, 0x0000BFEE  // A14 = -1.1333687314965721384777951065e-05
LOCAL_OBJECT_END(_6p5_to_8_data)
|
|
|
|
LOCAL_OBJECT_START(_13_to_16_data)
// tanh(x) polynomial coefficients for 13 <= |x| < 16.
// Entry order: A3 (two words), A2H..A0L (one word each),
// then A25 down to A14 (two words each).
data8 0x98176FD2075BDBD5, 0x00003FDB  // A3  = 1.7290807363028159200235264756e-11
data8 0xBD8C8464F76162D1              // A2H = -3.2420263805679445515400340441e-12
data8 0xBA2D56B508E0F1FD              // A2L = -1.8515322669984580704502445180e-28
data8 0x3D5C8464F761639C              // A1H = 4.0525329757100331782338488690e-13
data8 0x3A0A09D9E328E620              // A1L = 4.1081479300866418212862258651e-29
data8 0x3FEFFFFFFFFFFF1B              // A0H = 9.9999999999997457589273608392e-01
data8 0x3C9B9B089E9BFD89              // A0L = 9.5776165728054091471814161399e-17
data8 0xC5395B9EC765BDB7, 0x00003FE6  // A25 = 4.5919803498257974411526879804e-08
data8 0x9A0F1FCB1DC24C3A, 0x00003FE8  // A24 = 1.4347869798460288751020493795e-07
data8 0x8AA5C3459FAD0B28, 0x00003FE9  // A23 = 2.5825111356333853968900510087e-07
data8 0x9578B747988CFF9D, 0x00003FE9  // A22 = 2.7841245127068220034870119246e-07
data8 0x810DF1A589D9CAF1, 0x00003FE9  // A21 = 2.4038267971021370956311255310e-07
data8 0x8A00D77B9416EB75, 0x00003FE8  // A20 = 1.2852557749068320312899366352e-07
data8 0xB2436C4A1849C498, 0x00003FE7  // A19 = 8.3010350873515703893886683374e-08
data8 0xEA6405B18356600B, 0x00003FE3  // A18 = 6.8216675390299296071261114202e-09
data8 0xF7606C022194B7E8, 0x00003FE5  // A17 = 2.8798432098264655723769995993e-08
data8 0xAF4B0C453FCAF34E, 0x0000BFE5  // A16 = -2.0406809167824936143455638336e-08
data8 0xC324C1F10D5FA7CC, 0x00003FE5  // A15 = 2.2717703170390130238356558599e-08
data8 0xB34A2E3A4D3B9C31, 0x0000BFE5  // A14 = -2.0872076027950789618606920471e-08
LOCAL_OBJECT_END(_13_to_16_data)
|
|
|
|
|
|
//////// "Tail" tables //////////
|
|
LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
// tanh(x) tail polynomial coefficients for 1/8 <= |x| < 1/4.
// Entry order: A13 down to A4, two words each.
data8 0x9D7D206E97ADC83A, 0x0000BFCC  // A13 = -5.4639895428711257047470806445e-16
data8 0xA8972B666A845810, 0x00003FD3  // A12 = 7.4869224589947988668562043110e-14
data8 0x9A5B31511C9F4698, 0x0000BFD4  // A11 = -1.3709586467430093373657009487e-13
data8 0xCBB8047BCB274982, 0x0000BFDA  // A10 = -1.1580074124926108509393610532e-11
data8 0xF95EB849E5F9247C, 0x00003FDC  // A9  = 5.6700173336564916962945623180e-11
data8 0xE7893404C6A53386, 0x00003FE1  // A8  = 1.6846457582993065168777704528e-09
data8 0xF2E5C7E2B5F55ECC, 0x0000BFE4  // A7  = -1.4138500046802141367543484859e-08
data8 0xF43906FF53A002C0, 0x0000BFE8  // A6  = -2.2745017243678613107034288816e-07
data8 0xC6175D5E47D1D259, 0x00003FEC  // A5  = 2.9517899220726077077586632607e-06
data8 0xE7C2AE92CB36769B, 0x00003FEF  // A4  = 2.7628001723157068127646694830e-05
LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
// tanh(x) tail polynomial coefficients for 1/4 <= |x| < 1/2.
// Entry order: A13 down to A4, two words each.
data8 0x9E2972C008B9965E, 0x0000BFDC  // A13 = -3.5961854154738002253192260213e-11
data8 0xC3EABA3D219BEA8A, 0x00003FDB  // A12 = 2.2273173303628274478819473067e-11
data8 0xC50FB68D960D5CD9, 0x00003FE1  // A11 = 1.4338102430978399800743148719e-09
data8 0xB3BB92499EF2D583, 0x0000BFE3  // A10 = -5.2309100551458044083112632491e-09
data8 0xBD915BE632F1D04E, 0x0000BFE6  // A9  = -4.4137194873936112573773943707e-08
data8 0xBC48C813FA819141, 0x00003FE9  // A8  = 3.5070684356359066908197915734e-07
data8 0xD3E34EA031AC611B, 0x00003FEA  // A7  = 7.8934400708919584259192272835e-07
data8 0x8EAC489D859541CD, 0x0000BFEF  // A6  = -1.7007944944124693133572815137e-05
data8 0x98D4D7E5D1508B8A, 0x00003FEF  // A5  = 1.8218924920302265989878708948e-05
data8 0xAC262F3F8CF49C02, 0x00003FF4  // A4  = 6.5669692402266433496312492412e-04
LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_0p5_to_1_data_tail)
// tanh(x) tail polynomial coefficients for 1/2 <= |x| < 1.
// Entry order: A13 down to A4, two words each.
data8 0xDF67FB36FFA2A538, 0x00003FE7  // A13 = 1.0403160796697495720021114635e-07
data8 0xB7FB80FB5AFA63A4, 0x0000BFE8  // A12 = -1.7134699677764282023124981753e-07
data8 0xC87625A0BA7D6C5F, 0x0000BFEA  // A11 = -7.4677732458471897291461679095e-07
data8 0x90DA375DD9AF6D79, 0x00003FED  // A10 = 4.3169381418023765618186668159e-06
data8 0x82DFB03317B17316, 0x0000BFED  // A9  = -3.9003426534601562552753368105e-06
data8 0xAA582FD4F3438BB4, 0x0000BFF0  // A8  = -4.0613288845040776435400454867e-05
data8 0xB1532D8CF763B21C, 0x00003FF2  // A7  = 1.6911021594787399557528570601e-04
data8 0x82E12AEF7CAB76C6, 0x0000BFEF  // A6  = -1.5602059530458172761585925044e-05
data8 0x83256E3D0FBA5C93, 0x0000BFF6  // A5  = -2.0011324059500451791903108104e-03
data8 0xCC4AB2EC0965499B, 0x00003FF7  // A4  = 6.2344907419841579664122448353e-03
LOCAL_OBJECT_END(_0p5_to_1_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_1_to_2_data_tail)
// tanh(x) tail polynomial coefficients for 1 <= |x| < 2.0.
// Entry order: A13 down to A4, two words each.
data8 0xCCAEE174EAC17F78, 0x0000BFEE  // A13 = -1.2200065117856038355953618829e-05
data8 0xA39DD0981D1A2776, 0x0000BFF0  // A12 = -3.9009204899026604074167603200e-05
data8 0xB7104FA27FAF80D0, 0x00003FF2  // A11 = 1.7458316338540792661905876072e-04
data8 0xB219A7274436A734, 0x0000BFF3  // A10 = -3.3969918595931391572998415468e-04
data8 0xCCD9D03C0C73CECF, 0x00003FF2  // A9  = 1.9536097875337884986025498958e-04
data8 0x85321EA40CFEEBEE, 0x00003FF5  // A8  = 1.0162031558369402750607778300e-03
data8 0x81F272C08C308220, 0x0000BFF7  // A7  = -3.9656696618251138315464862909e-03
data8 0xE8761C6BDEA9ED87, 0x00003FF7  // A6  = 7.0941580558970243020090656343e-03
data8 0xAE4E9F3691F66877, 0x0000BFF6  // A5  = -2.6597155288710984120834711909e-03
data8 0xCC8286B331BD8AAA, 0x0000BFF9  // A4  = -2.4964583478826523250880337777e-02
LOCAL_OBJECT_END(_1_to_2_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_2_to_3p25_data_tail)
// tanh(x) tail polynomial coefficients for 2 <= |x| < 3.25.
// Entry order: A13 down to A4, two words each.
data8 0x92E1711A3BD6408B, 0x0000BFF4  // A13 = -5.6030514548041036913731470443e-04
data8 0x8B9BD885FF3E98C5, 0x00003FF5  // A12 = 1.0651304064581604055612602669e-03
data8 0xD041356C7FA26A22, 0x0000BFF5  // A11 = -1.5888574328066952147023520244e-03
data8 0xDFA210BE9BE6B7FD, 0x00003FF5  // A10 = 1.7061849060196387827639060629e-03
data8 0x8ECC3606808028E9, 0x0000BFF4  // A9  = -5.4472999329435778312080340471e-04
data8 0xD5C053B8EEBD10C8, 0x0000BFF6  // A8  = -3.2615856552479930645151033322e-03
data8 0xB7BFD63AC5051539, 0x00003FF8  // A7  = 1.1215171059191957498023766643e-02
data8 0xC367C59D7FA3ADA2, 0x0000BFF9  // A6  = -2.3853193251842394834616848995e-02
data8 0x9FC9FB890BB053CF, 0x00003FFA  // A5  = 3.9010984954739386625695104667e-02
data8 0xD01D077B42E7ED76, 0x0000BFFA  // A4  = -5.0808934425896607486919526567e-02
LOCAL_OBJECT_END(_2_to_3p25_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_4_to_6p5_data_tail)
|
|
// Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
// "Tail" coefficients, stored from A13 (first) down to A4 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The main path of tanhl reads these entries with ldfe.
|
|
data8 0x870CCE8C76C52C7E, 0x00003FF5 //A13 = 1.0303499350193060915603525934e-03
|
|
data8 0xE1431E54AD2A738B, 0x0000BFF5 //A12 = -1.7186140560972621669872002486e-03
|
|
data8 0xAB20056533E28734, 0x00003FF6 //A11 = 2.6111615345168277554841545330e-03
|
|
data8 0xECCB91D64718B9BD, 0x0000BFF6 //A10 = -3.6132079169671860943878776041e-03
|
|
data8 0x94771DA3B8C2EB4F, 0x00003FF7 //A9 = 4.5308012699419563988381317896e-03
|
|
data8 0xA7497377E4946F2C, 0x0000BFF7 //A8 = -5.1051915941441437592654444804e-03
|
|
data8 0xA76B2D6FCA088AE9, 0x00003FF7 //A7 = 5.1092120989582196669504468168e-03
|
|
data8 0x928C8961F33C9560, 0x0000BFF7 //A6 = -4.4723196805537430568162704711e-03
|
|
data8 0xDBDDDF6CDE9AB9BE, 0x00003FF6 //A5 = 3.3548994514326736175581084349e-03
|
|
data8 0x896E211733AD9D40, 0x0000BFF6 //A4 = -2.0970183170010094667442967500e-03
|
|
LOCAL_OBJECT_END(_4_to_6p5_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_8_to_13_data_tail)
|
|
// Polynomial coefficients for the tanh(x), 8 <= |x| < 13
// "Tail" coefficients, stored from A13 (first) down to A4 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The main path of tanhl reads these entries with ldfe.
|
|
data8 0xE50C3476BED020AA, 0x00003FF0 //A13 = 5.4609221347524272615754239857e-05
|
|
data8 0xBA16F5F4EDC0EABC, 0x0000BFF0 //A12 = -4.4367239594986428539386662937e-05
|
|
data8 0x8B916C2F002C3D91, 0x00003FF0 //A11 = 3.3275617838067362533536610680e-05
|
|
data8 0xBFE8031097CB4442, 0x0000BFEF //A10 = -2.2877013297722792747267224605e-05
|
|
data8 0xEFE1FFD106B2DA41, 0x00003FEE //A9 = 1.4298129659899553350478452989e-05
|
|
data8 0x86EF1FF403A6622E, 0x0000BFEE //A8 = -8.0426979849841642112688693288e-06
|
|
data8 0x86EF200FD047306B, 0x00003FED //A7 = 4.0213490418736097707257704218e-06
|
|
data8 0xEC22782377882553, 0x0000BFEB //A6 = -1.7593402092805559754997565942e-06
|
|
data8 0xB119DA1DB7C47773, 0x00003FEA //A5 = 6.5975257917246601211360847253e-07
|
|
data8 0xDD6050A7761D67BB, 0x0000BFE8 //A4 = -2.0617268111985310661707082242e-07
|
|
LOCAL_OBJECT_END(_8_to_13_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_16_to_22p8_data_tail)
|
|
// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
// "Tail" coefficients, stored from A13 (first) down to A4 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The main path of tanhl reads these entries with ldfe.
|
|
data8 0xEAF4AF87336E81B1, 0x00003FEF //A13 = 2.8008914392791730186582989654e-05
|
|
data8 0xD5B309EA768E2711, 0x00003FED //A12 = 6.3687375204024238267961143128e-06
|
|
data8 0xA4048CA537113538, 0x00003FEB //A11 = 1.2220276227448617951538196845e-06
|
|
data8 0xD3EC78BB3425377D, 0x00003FE8 //A10 = 1.9736934193679794194181457250e-07
|
|
data8 0xE5763CD37440266E, 0x00003FE5 //A9 = 2.6712876934440631473215182284e-08
|
|
data8 0xCECA765EEB4A265F, 0x00003FE2 //A8 = 3.0092031912460315516888139627e-09
|
|
data8 0x99ABF588DF81A52E, 0x00003FDF //A7 = 2.7952722177649984066847682907e-10
|
|
data8 0xB9C78918294A4685, 0x00003FDB //A6 = 2.1120676552098603524020495036e-11
|
|
data8 0xB3A3C42AD539D50F, 0x00003FD7 //A5 = 1.2764169243389521270291967366e-12
|
|
data8 0x86BC347939478174, 0x00003FD3 //A4 = 5.9834437707863962671883176163e-14
|
|
LOCAL_OBJECT_END(_16_to_22p8_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_3p25_to_4_data_tail)
|
|
// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
// "Tail" coefficients, stored from A13 (first) down to A4 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The main path of tanhl reads these entries with ldfe; this upper-subrange
// table is reached through the extra 0x280 tail offset added under p8.
|
|
data8 0xBE9A2BE19F21BA1C, 0x0000BFEE //A13 = -1.1360778336288065244475976873e-05
|
|
data8 0xF84910F515BDB014, 0x00003FED //A12 = 7.3994819819577018481862729782e-06
|
|
data8 0xC4C84FB788AA4007, 0x00003FEF //A11 = 2.3458298013663976251972482656e-05
|
|
data8 0x86CC6243C170E5ED, 0x0000BFF2 //A10 = -1.2855374755847770638424932233e-04
|
|
data8 0xD3065AC539ABABFF, 0x00003FF3 //A9 = 4.0249790677367806832685138089e-04
|
|
data8 0x82C4413795EC381B, 0x0000BFF5 //A8 = -9.9767013652382759950854031514e-04
|
|
data8 0x88D588720888899A, 0x00003FF6 //A7 = 2.0879228705174076794011525274e-03
|
|
data8 0xF4CA066137741469, 0x0000BFF6 //A6 = -3.7351861548964870836350490741e-03
|
|
data8 0xB998746D56E81737, 0x00003FF7 //A5 = 5.6639259807333999973200378964e-03
|
|
data8 0xE93FB2F48233275B, 0x0000BFF7 //A4 = -7.1181892208343798194003322900e-03
|
|
LOCAL_OBJECT_END(_3p25_to_4_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_6p5_to_8_data_tail)
|
|
// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
// "Tail" coefficients, stored from A13 (first) down to A4 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The main path of tanhl reads these entries with ldfe; this upper-subrange
// table is reached through the extra 0x280 tail offset added under p8.
|
|
data8 0xA6881D7D21774BFD, 0x00003FEF //A13 = 1.9852125640303530752913966680e-05
|
|
data8 0x875E983AA042E605, 0x0000BFF0 //A12 = -3.2274606306629334402383651599e-05
|
|
data8 0xCB19E01E94FC133C, 0x00003FF0 //A11 = 4.8423069963831314927026982707e-05
|
|
data8 0x8BA5E8D9E72D56B2, 0x0000BFF1 //A10 = -6.6589395655200734237190902534e-05
|
|
data8 0xAE91F647ED4E46B2, 0x00003FF1 //A9 = 8.3241541003842930001632190258e-05
|
|
data8 0xC465A7E0B22F884E, 0x0000BFF1 //A8 = -9.3649431639051891449916386619e-05
|
|
data8 0xC4666148AA01A4D7, 0x00003FF1 //A7 = 9.3650780646160216748407869111e-05
|
|
data8 0xABD9E63D181B0C6C, 0x0000BFF1 //A6 = -8.1945023256769295802996591839e-05
|
|
data8 0x80E38B18E509387A, 0x00003FF1 //A5 = 6.1458988764532931141264026311e-05
|
|
data8 0xA11C80E20ADA5A64, 0x0000BFF0 //A4 = -3.8411937140983728563216440713e-05
|
|
LOCAL_OBJECT_END(_6p5_to_8_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_13_to_16_data_tail)
|
|
// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
// "Tail" coefficients, stored from A13 (first) down to A4 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The main path of tanhl reads these entries with ldfe; this upper-subrange
// table is reached through the extra 0x280 tail offset added under p8.
|
|
data8 0x9D6CCDA4767CA6D9, 0x00003FE5 //A13 = 1.8326683535066775712253572575e-08
|
|
data8 0xFFAF154F334BF403, 0x0000BFE4 //A12 = -1.4882762852665077172347508377e-08
|
|
data8 0xBFC68FA7C61B6C17, 0x00003FE4 //A11 = 1.1162810813806544919835662888e-08
|
|
data8 0x83D8439A6B19A015, 0x0000BFE4 //A10 = -7.6743763372603959795701788561e-09
|
|
data8 0xA4CE5BE9DC6A2962, 0x00003FE3 //A9 = 4.7964885012772346158732715382e-09
|
|
data8 0xB96826C0697253CA, 0x0000BFE2 //A8 = -2.6980246373950994097953903952e-09
|
|
data8 0xB96826CADDC00E35, 0x00003FE1 //A7 = 1.3490123232313844006540534789e-09
|
|
data8 0xA23B21F1155DF322, 0x0000BFE0 //A6 = -5.9019289132168830718664922372e-10
|
|
data8 0xF358B2E9A50C349C, 0x00003FDE //A5 = 2.2132233424669131155945897524e-10
|
|
data8 0x98176FD2074C1D77, 0x0000BFDD //A4 = -6.9163229452106125388824134881e-11
|
|
LOCAL_OBJECT_END(_13_to_16_data_tail)
|
|
|
|
LOCAL_OBJECT_START(_0_to_1o8_data)
|
|
// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.125
// Odd-power coefficients only, stored from A15 (first) down to A3 (last).
// Each entry is 16 bytes: a data8 with the 64-bit significand followed by
// a data8 whose low 16 bits hold the sign and the biased exponent.
// The _0_to_1o8 path reads them with ldfe through two pointers set to
// offsets 0x11e0 and 0x11f0 from the common data pointer, each stepping
// by 32 bytes.
|
|
data8 0xBA0EC1879495150B, 0x0000BFF5 // A15 = -1.4195071451378679802688367813e-03
|
|
data8 0xEB5A82898D1BCBA4, 0x00003FF6 // A13 = 3.5912102408030526706365632879e-03
|
|
data8 0x91370DAFE0B64438, 0x0000BFF8 // A11 = -8.8632234251336964576640807982e-03
|
|
data8 0xB327A435358F1200, 0x00003FF9 // A9 = 2.1869488447622383899199238857e-02
|
|
data8 0xDD0DD0DD07A0775F, 0x0000BFFA // A7 = -5.3968253967902161405327069187e-02
|
|
data8 0x888888888887C299, 0x00003FFC // A5 = 1.3333333333333264660338062012e-01
|
|
data8 0xAAAAAAAAAAAAAA98, 0x0000BFFD // A3 = -3.3333333333333333282255458755e-01
|
|
LOCAL_OBJECT_END(_0_to_1o8_data)
|
|
|
|
|
|
.section .text
|
|
// tanhl entry: argument classification and dispatch.
// The argument arrives in f8; every exit path writes its result to f8.
// Interval number = (biased exponent of x) - 0xfffc, so it is negative for
// |x| < 1/8 and equals 7 for 16 <= |x| < 32.
// Exits taken from this section:
//   tanhl_spec  (p6)  - argument matched fclass mask 0xEF
//   _0_to_1o8   (p9)  - interval number is negative
//   _saturation (p11) - interval number above 7, or interval 7 with the
//                       top 12 significand bits >= 0xb70
// Otherwise execution falls through to the main polynomial path with
// rAddr1/rAddr2 addressing the 0x100-byte coefficient block selected by
// the interval number. p8 is set (and p10 cleared) when the exponent is at
// least 0x10000 and the top 12 significand bits are >= 0xd01; those
// arguments use the coefficient blocks 0x400 bytes further on.
// p14/p15 record a non-negative/negative argument for the later paths.
GLOBAL_LIBM_ENTRY(tanhl)
|
|
|
|
{ .mfi
|
|
alloc r32 = ar.pfs, 0, 21, 0, 0
|
|
fmerge.se fArgAbsNorm = f1, f8 // sign and exponent of 1.0 with the
// significand of x: normalized x (1.0 <= x < 2.0)
|
|
addl rSignBit = 0x20000, r0 // Set sign bit for exponent
|
|
}
|
|
{ .mlx
|
|
addl rDataPtr = @ltoff(tanhl_data), gp // Get common data ptr
|
|
movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres.
|
|
};;
|
|
|
|
{ .mfi
|
|
getf.exp rArgExp = f8 // Get arg exponent
|
|
fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
|
|
// 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
|
|
addl rBias = 0xfffc, r0 // Value to subtract from exp
|
|
// to get actual interval number
|
|
}
|
|
{ .mfi
|
|
ld8 rDataPtr = [rDataPtr] // Get real common data pointer
|
|
fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path)
|
|
addl r2to4 = 0x10000, r0 // unbiased exponent
|
|
// for [2;4] binary interval
|
|
};;
|
|
|
|
{ .mfi
|
|
getf.sig rArgSig = f8 // Get arg significand
|
|
fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive?
|
|
addl rSaturation = 0xb70, r0 // First 12 bits of
|
|
// saturation value signif.
|
|
}
|
|
{ .mfi
|
|
setf.d f1p5 = r1p5 // 1.5 construction
|
|
fma.s1 f2p0 = f1,f1,f1 // 2.0 construction
|
|
addl r1625Sign = 0xd01, r0 // First 12 bits of
|
|
// 1.625 value signif.
|
|
// 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
|
|
};;
|
|
|
|
{ .mfi
|
|
addl rTailDataPtr = 0xB00, rDataPtr // Pointer to "tail" data
|
|
fmerge.s fSignumX = f8, f1 // signum(x)
|
|
andcm rArgExp = rArgExp, rSignBit // Remove sign of exp
|
|
}
|
|
{ .mfb
|
|
addl rTiny = 0xf000, r0 // Tiny value for saturation path
|
|
nop.f 0
|
|
(p6) br.cond.spnt tanhl_spec // Branch to zero, denorm & specs
|
|
};;
|
|
|
|
{ .mfi
|
|
sub rInterval = rArgExp, rBias // Get actual interval number
|
|
nop.f 0
|
|
shr.u rArgSig = rArgSig, 52 // Leave only the top 12 bits
// of the significand
|
|
}
|
|
{ .mfi
|
|
adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
|
|
nop.f 0
|
|
cmp.ge p8, p10 = rArgExp, r2to4 // If exp >= 2to4 interval?
|
|
};;
|
|
|
|
{ .mfi
|
|
(p8) cmp.le p8, p10 = r1625Sign, rArgSig // If signd is greater
|
|
// than 1.625? (arg is at one of binary subranges)
|
|
nop.f 0
|
|
shl rOffset = rInterval, 8 // Make offset from
|
|
// interval number (0x100 bytes per interval)
|
|
}
|
|
{ .mfi
|
|
cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0
|
|
// (means arg is in [0; 1/8])
|
|
nop.f 0
|
|
cmp.eq p7, p0 = 0x7, rInterval // If arg is in [16;32) interv.?
|
|
};;
|
|
|
|
{ .mfi
|
|
(p8) adds rOffset = 0x400, rOffset // Add additional offset
|
|
// (arg is at one of binary subranges)
|
|
fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path)
|
|
shl rTailOffset = rInterval, 7 // Make offset to "tail" data
|
|
// from interval number (first part: interval * 128)
|
|
}
|
|
{ .mib
|
|
setf.exp fTiny = rTiny // Construct "tiny" value
|
|
// for saturation path
|
|
cmp.ltu p11, p0 = 0x7, rInterval // if interval > 7 (|x| >= 32)
|
|
(p9) br.cond.spnt _0_to_1o8
|
|
};;
|
|
|
|
{ .mfi
|
|
add rAddr1 = rDataPtr, rOffset // Get address for
|
|
// interval data
|
|
nop.f 0
|
|
shl rTailAddOffset = rInterval, 5 // Offset to interval
|
|
// "tail" data (second part: interval * 32)
|
|
}
|
|
{ .mib
|
|
add rAddr2 = rShiftedDataPtr, rOffset // Get second
|
|
// address for interval data
|
|
(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
|
|
// in [22.8;32] interval
|
|
(p11) br.cond.spnt _saturation // Branch to Saturation path
|
|
};;
|
|
|
|
// Main path, part 1: load the coefficients for the selected interval and
// form the reduced argument.
// rAddr1 walks A3, then (0x90 bytes later) A20 down to A14; rAddr2 walks
// the double pairs A2, A1, A0 (high, low) and then A25 down to A21.
// The tail pointers are 16 bytes apart and step by 32, loading A13..A4.
// Tail offset = interval * 128 + interval * 32, plus 0x280 under p8.
// fArgAbsNorm (in [1.0; 2.0)) is reduced by subtracting 2.0 under p8 or
// 1.5 under p10; the polynomial below is evaluated in that reduced value.
{ .mmi
|
|
ldfe fA3 = [rAddr1], 0x90 // Load A3
|
|
ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low
|
|
add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA20 = [rAddr1], 16 // Load A20
|
|
ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low
|
|
(p8) adds rTailOffset = 0x280, rTailOffset // Additional offset
|
|
// (arg is at one of binary subranges)
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA19 = [rAddr1], 16 // Load A19
|
|
ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low
|
|
add rTailAddr1 = rTailDataPtr, rTailOffset // First tail
|
|
// data address
|
|
};;
|
|
|
|
.pred.rel "mutex",p8,p10
|
|
{ .mfi
|
|
ldfe fA18 = [rAddr1], 16 // Load A18
|
|
(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Subtract 2.0
|
|
// (arg is at one of binary subranges)
|
|
adds rTailAddr2 = 0x10, rTailAddr1 // Second tail
|
|
// data address
|
|
}
|
|
{ .mfi
|
|
ldfe fA25 = [rAddr2], 16 // Load A25
|
|
(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Subtract 1.5
|
|
// from normalized arg
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA17 = [rAddr1], 16 // Load A17
|
|
ldfe fA24 = [rAddr2], 16 // Load A24
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA16 = [rAddr1], 16 // Load A16
|
|
ldfe fA23 = [rAddr2], 16 // Load A23
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA15 = [rAddr1], 16 // Load A15
|
|
ldfe fA22 = [rAddr2], 16 // Load A22
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA14 = [rAddr1], 16 // Load A14
|
|
ldfe fA21 = [rAddr2], 16 // Load A21
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
ldfe fA13 = [rTailAddr1], 32 // Load A13
|
|
fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
ldfe fA12 = [rTailAddr2], 32 // Load A12
|
|
nop.f 0
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
ldfe fA11 = [rTailAddr1], 32 // Load A11
|
|
fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // A3*x + A2High
// (high part of A3*x+A2)
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
ldfe fA10 = [rTailAddr2], 32 // Load A10
|
|
fma.s1 fTH = fA3, fArgAbsNorm, f0 // A3*x alone, used later
// to recover the low part of A3*x+A2
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
ldfe fA9 = [rTailAddr1], 32 // Load A9
|
|
fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1Low*x (part of A1*x+A0)
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
ldfe fA8 = [rTailAddr2], 32 // Load A8
|
|
nop.f 0
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA7 = [rTailAddr1], 32 // Load A7
|
|
ldfe fA6 = [rTailAddr2], 32 // Load A6
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA5 = [rTailAddr1], 32 // Load A5
|
|
ldfe fA4 = [rTailAddr2], 32 // Load A4
|
|
nop.i 0
|
|
};;
|
|
|
|
// Main path, part 2: partial products. Here x denotes the reduced argument
// fArgAbsNorm. This section builds:
//   - powers x^2 (plus its rounding error), x^3, x^4, x^8 and x^11;
//   - the first steps of the high/low evaluation of A3*x+A2 and A1*x+A0;
//   - pairwise and x^2-level combinations of the "tail" coefficients
//     A4..A14 and A15..A25.
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2
|
|
// Low part of x^2 (delta): x*x - rounded x^2
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fRes3L = fA2H, f1, fRes3H // A2High - fRes3H: first step of
// the low part of A3*x+A2
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1High*x + A1Low*x
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail: A24*x+A23
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail: A22*x+A21
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail: A13*x+A12
|
|
nop.i 0
|
|
}
|
|
;;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes3L = fRes3L, f1, fTH // low part of A3*x+A2: add A3*x
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail: A20*x+A19
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes1H = fTH2, f1, fA0H // high part of A1*x+A0
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1High*x - fTH2
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail: A9*x+A8
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail: A11*x+A10
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail: A16*x+A15
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail: A18*x+A17
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail: A5*x+A4
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes3L = fRes3L, f1, fA2L // low part of A3*x+A2: add A2Low
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail: A7*x+A6
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fTL2 = fTL2, f1, fTT2 // fTL2 + A1Low*x
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fRes1L = fA0H, f1, fRes1H // A0High - fRes1H: first step of
// the low part of A1*x+A0
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail: A23..A25
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail: A12..A14
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail: A19..A22
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail: A8..A11
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail: A15..A18
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
|
|
nop.i 0
|
|
};;
|
|
|
|
// Main path, part 3: final assembly. Here x denotes the reduced argument
// fArgAbsNorm. The sum being formed is
//   (A1*x+A0) + (A3*x+A2)*x^2 + x^4 * (tail polynomial)
// kept as a high part (fResH) and a low part (fResL). The tail polynomial
// is fRes4 = fA15*x^11 + fA4, where fA4 combines A4..A14 and fA15 combines
// A15..A25. p14 (argument not negative) adds the parts; p15 (argument
// negative) uses the fms forms so that the returned value is negated.
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (low part of A3*x+A2)*x^2
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail: A4..A7
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes1L = fRes1L, f1, fTH2 // low part of A1*x+A0: add fTH2
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fArgAbsNorm4X = fArgAbsNorm4, fSignumX, f0 // x^4 * signum
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail: A19..A25
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail: A8..A14
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fTT = fRes3H, fArgAbsNorm2L, fTT // add (high part of
// A3*x+A2) * (low part of x^2)
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes1L = fRes1L, f1, fTL2 // low part of A1*x+A0: add fTL2
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA15 = fA19, fArgAbsNorm4, fA15 // Polynomial tail: A15..A25
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA4 = fA8, fArgAbsNorm4, fA4 // Polynomial tail: A4..A14
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes2H = fRes3H, fArgAbsNorm2, fTT // high part of
// (A3*x+A2)*x^2
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes1L = fRes1L, f1, fA0L // low part of A1*x+A0: add A0Low
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of
|
|
// polynomial tail
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // first step of the
// low part of (A3*x+A2)*x^2
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fResH = fRes2H, f1, fRes1H // High result
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
(p14) fma.s1 fRes1L = fRes4, fArgAbsNorm4X, fRes1L // tail*x^4*signum
// plus low part of A1*x+A0
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
(p15) fms.s1 fRes1L = fRes4, fArgAbsNorm4X, fRes1L // tail*x^4*signum
// minus low part of A1*x+A0
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes2L = fRes2L, f1, fTT // low part of (A3*x+A2)*x^2: add fTT
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fms.s1 fResL = fRes1H, f1, fResH // Low result: fRes1H - fResH
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s0 fRes1L = fRes2L, fSignumX, fRes1L // Low result
|
|
// .s0 - for symmetry issue resolving at +/-inf rounding mode
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fResL = fResL, f1, fRes2H // Low result: add fRes2H
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
(p14) fma.s0 fResL = fRes1L, f1, fResL // Low result
|
|
// .s0 - for symmetry issue resolving at +/-inf rounding mode
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
(p15) fms.s0 fResL = fRes1L, f1, fResL // Low result
|
|
// .s0 - for symmetry issue resolving at +/-inf rounding mode
|
|
nop.i 0
|
|
};;
|
|
|
|
.pred.rel "mutex",p14,p15
|
|
{ .mfi
|
|
nop.m 0
|
|
(p14) fma.s0 f8 = fResL, f1, fResH// Add high and low results
|
|
nop.i 0
|
|
}
|
|
{ .mfb
|
|
nop.m 0
|
|
(p15) fms.s0 f8 = fResL, f1, fResH // Low result minus high result
// (negative argument)
|
|
br.ret.sptk b0 // Main path return
|
|
};;
|
|
|
|
// saturation path ////////////////////////////////////////////////////////////
// Reached when the interval number is above 7, or equals 7 with the top
// 12 significand bits >= 0xb70. Returns 1 - tiny for a non-negative
// argument (p14) and tiny - 1 for a negative one (p15); fTiny was built
// with setf.exp from the exponent value 0xf000.
|
|
_saturation:
|
|
|
|
.pred.rel "mutex",p14,p15
|
|
{ .mfi
|
|
nop.m 0
|
|
(p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny
|
|
nop.i 0
|
|
};;
|
|
{ .mfb
|
|
nop.m 0
|
|
(p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1
|
|
br.ret.sptk b0 // Saturation path return
|
|
};;
|
|
|
|
|
|
// 0, denormals and special IEEE numbers path /////////////////////////////////
// Reached when the argument matched fclass mask 0xEF at entry.
//   infinity     -> +/-1 with the sign of the argument
//   NaN or zero  -> x*1 + x (a zero keeps its sign, NaNs propagate)
//   anything else that got here -> normalize, then r - r^2 for p14
//                                  (argument not negative) or r + r^2
//                                  for p15 (argument negative)
|
|
tanhl_spec:
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fclass.m p6,p0 = f8, 0x23 // To filter infinities
|
|
// 0x23 = @pos|@neg|@inf
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
|
|
// 0xC7 = @pos|@neg|@zero|@qnan|@snan
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfb
|
|
nop.m 0
|
|
(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
|
|
(p6) br.ret.spnt b0 // exit for x = INF
|
|
};;
|
|
|
|
{ .mfb
|
|
nop.m 0
|
|
(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
|
|
// and NaNs for NaNs
|
|
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fnorm.s0 f8 = f8 // Normalize arg
|
|
nop.i 0
|
|
};;
|
|
|
|
.pred.rel "mutex",p14,p15
|
|
{ .mfi
|
|
nop.m 0
|
|
(p14) fnma.s0 f8 = f8, f8, f8 // res = r-r^2
|
|
nop.i 0
|
|
}
|
|
{ .mfb
|
|
nop.m 0
|
|
(p15) fma.s0 f8 = f8, f8, f8 // res = r+r^2
|
|
br.ret.sptk b0 // 0, denormals, IEEE specials return
|
|
};;
|
|
|
|
|
|
// 0 < |x| < 1/8 path /////////////////////////////////////////////////////////
// Computes x + x^3 * P(x^2), where
//   P = A3 + A5*x^2 + A7*x^4 + A9*x^6 + A11*x^8 + A13*x^10 + A15*x^12.
// fArgSqr (x^2) and fArgCube (x^3) were computed before the branch here.
// The coefficients are read from offsets 0x11e0 and 0x11f0 of the common
// data pointer: rAddr1 loads A15, A11, A7, A3 and rAddr2 loads A13, A9, A5.
|
|
_0_to_1o8:
|
|
|
|
{ .mmi
|
|
adds rAddr1 = 0x11e0, rDataPtr // Ptr 1 to coeffs
|
|
adds rAddr2 = 0x11f0, rDataPtr // Ptr 2 to coeffs
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA15 = [rAddr1], 32 // Load A15
|
|
ldfe fA13 = [rAddr2], 32 // Load A13
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA11 = [rAddr1], 32 // Load A11
|
|
ldfe fA9 = [rAddr2], 32 // Load A9
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mmi
|
|
ldfe fA7 = [rAddr1], 32 // Load A7
|
|
ldfe fA5 = [rAddr2] // Load A5
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
ldfe fA3 = [rAddr1] // Load A3
|
|
fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail: A13*x^2+A11
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // x^4
|
|
nop.i 0
|
|
};;
|
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail: A5*x^2+A3
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail: A9*x^2+A7
|
|
nop.i 0
|
|
};;
|
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail: A11..A15
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail: A3..A9
|
|
nop.i 0
|
|
}
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fArgEight = fArgFour, fArgFour, f0 // x^8
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfi
|
|
nop.m 0
|
|
fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result
|
|
nop.i 0
|
|
};;
|
|
|
|
{ .mfb
|
|
nop.m 0
|
|
fma.s0 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3 + x
|
|
br.ret.sptk b0 // [0;1/8] interval return
|
|
};;
|
|
|
|
GLOBAL_LIBM_END(tanhl)
|
|
libm_alias_ldouble_other (tanh, tanh)
|