glibc/sysdeps/ia64/fpu/s_erf.S
Siddhesh Poyarekar 30891f35fa Remove "Contributed by" lines
We stopped adding "Contributed by" or similar lines in sources in 2012
in favour of git logs and keeping the Contributors section of the
glibc manual up to date.  Removing these lines makes the license
header a bit more consistent across files and also removes the
possibility of error in attribution when license blocks or files are
copied across since the contributed-by lines don't actually reflect
reality in those cases.

Move all "Contributed by" and similar lines (Written by, Test by,
etc.) into a new file CONTRIBUTED-BY to retain record of these
contributions.  These contributors are also mentioned in
manual/contrib.texi, so we just maintain this additional record as a
courtesy to the earlier developers.

The following scripts were used to filter a list of files to edit in
place and to clean up the CONTRIBUTED-BY file respectively.  These
were not added to the glibc sources because they're not expected to be
of any use in future given that this is a one time task:

https://gist.github.com/siddhesh/b5ecac94eabfd72ed2916d6d8157e7dc
https://gist.github.com/siddhesh/15ea1f5e435ace9774f485030695ee02

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
2021-09-03 22:06:44 +05:30

925 lines
24 KiB
ArmAsm

.file "erf.s"
// Copyright (c) 2001 - 2005, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 08/15/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================
// double erf(double)
//
// Overview of operation
//==============================================================
// Background
//
//
// There are 9 paths:
// 1. x = +/-0.0
// Return erf(x) = +/-0.0
//
// 2. 0.0 < |x| < 0.5
// Return erf(x) = x *Pol9(x^2)
//
// 3. For several subranges of 0.5 <= |x| < 5.90625
// Return erf(x) = sign(x)*Pol19(y),
// where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19
//
// For each subrange there is particular set of coefficients.
// Below is the list of subranges:
// 3.1 0.5 <= |x| < 1.0, b = a = 0.5
// 3.2 1.0 <= |x| < 2.0, b = a = 1.0
// 3.3 2.0 <= |x| < 3.25, b = a = 2.0
// 3.4 4.0 <= |x| < 5.90625, b = a = 4.0
//
// 4. 3.25 <= |x| < 4.0
// Return erf(x) = sign(x)*Pol14(|x| - 3.25)
//
// 5. 5.90625 <= |x| < +INF
// Return erf(x) = sign(x)*(1.0d - 2^(-63))
//
// 6. |x| = INF
// Return erf(x) = sign(x) * 1.0
//
// 7. x = [S,Q]NaN
// Return erf(x) = QNaN
//
// 8. x is positive denormal
// Return erf(x) = A0*x - x^2,
// where A0 = 2.0/sqrt(Pi)
//
// 9. x is negative denormal
// Return erf(x) = A0*x + x^2,
// where A0 = 2.0/sqrt(Pi)
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input, output
// f32 -> f63
// General registers used:
// r32 -> r48, r2, r3
// Predicate registers used:
// p0, p6 -> p15
// p6 to filter out case when x = denormal
// p7 to filter out case when x = [Q,S]NaN or +/-0,
// used also to process denormals
// p8 to filter out case when 3.25 < |x| < 4.0,
// used also to process denormals
// p9 to filter out case when |x| = inf
// p10 to filter out case when |x| < 0.5
// p11 set when |x| <= 3.25 or |x| >= 4.0 (complement of p8)
// p12 to filter out case when |x| >= 5.90625
// p13 set if 4.0 <=|x| < 5.90625
// p14 set to 1 for positive x
// p15 set to 1 for negative x
// Assembly macros
//==============================================================
// General-register aliases.  r2/r3 are used as-is; r33..r48 are among the
// 17 locals (r32..r48) requested by the alloc at the entry of erf, where
// r32 itself receives ar.pfs.
rDataPtr = r2 // address of erf_data
rDataPtr1 = r3 // not referenced by the code visible in this file
rBias = r33 // 0x3FE00: exponent field of 0.5, shifted left by 8
rCoeffAddr3 = r34 // walks coefficients ##16..19 (A3, A1)
rThreeAndQ = r35 // bit pattern of the double 3.25
rCoeffAddr2 = r36 // walks the odd-numbered table entries (A18, A16, ...)
rMask = r37 // 0x7FF00: keeps the exponent field of (bits >> 44)
rArg = r38 // bit pattern of x
rSignBit = r39 // 1 << 63
rAbsArg = r40 // bit pattern of |x|
rSaturation = r41 // 0x4017A: (bits of 5.90625) >> 44
rIndex = r42 // (exponent of x - exponent of 0.5) << 8
rCoeffAddr1 = r43 // walks the even-numbered table entries (A19, A17, ...)
rCoeffAddr4 = r44 // walks coefficients ##16..19 (A2, A0)
rShiftedArg = r45 // rArg >> 44 (unsigned shift)
rShiftedArgMasked = r46 // rShiftedArg & rMask
rBiasedExpOf4 = r47 // 0x40100: (bits of 4.0) >> 44
rShiftedAbsArg = r48 // rAbsArg >> 44
//==============================================================
// Floating-point register aliases.
// fA0..fA19 receive polynomial coefficients.  On the Pol19 path the alias
// number equals the coefficient number; the erf_3q_4 and erf_near_zero
// paths reuse the same registers for their own coefficient sets.
fA0 = f32
fA1 = f33
fA2 = f34
fA3 = f35
fA4 = f36
fA5 = f37
fA6 = f38
fA7 = f39
fA8 = f40
fA9 = f41
fA10 = f42
fA11 = f43
fA12 = f44
fA13 = f45
fA14 = f46
fA15 = f47
fA16 = f48
fA17 = f49
fA18 = f50
fA19 = f51
fArgSqr = f52 // x^2
fArgAbsNorm = f53 // significand of x in [1,2), then minus 1.0 (the "y" of Pol19)
fSignumX = f54 // sign of x merged with 1.0
fRes = f55 // running polynomial accumulator
fThreeAndQ = f56 // 3.25
fArgAbs = f57 // |x|, later |x| - 3.25
fTSqr = f58 // y^2 (Pol19), (|x|-3.25)^2 (erf_3q_4), x^4 (erf_near_zero)
fTQuadr = f59 // square of fTSqr
fTDeg3 = f60 // y^3 (Pol19 path only)
fTDeg7 = f61 // y^7 on the Pol19 path, (|x|-3.25)^6 on erf_3q_4
fArgAbsNormSgn = f62 // polynomial argument multiplied by fSignumX
fTQuadrSgn = f63 // x^9 on the erf_near_zero path
// Data tables
//==============================================================
RODATA
.align 64
// Layout of erf_data, as byte offsets from the start of the object.
// Every "data8 significand, sign/exponent" pair occupies 16 bytes and is
// read with ldfe; every lone data8 occupies 8 bytes and is read in pairs
// with ldfpd.  The code addresses the table with these hard-coded offsets:
//    0 .. 1023  A19..A4 of Pol19, 256 bytes per subrange
//               (0.5-1.0 at 0, 1.0-2.0 at 256, 2.0-3.25 at 512, 4.0-6.0 at 768)
// 1024 .. 1279  A3..A0 of Pol19, 64 bytes per subrange, same subrange order
// 1280 .. 1327  |x| < 0.5: A9, A8, A5, A4 (doubles) and A1 (extended)
// 1328 .. 1375  |x| < 0.5: A7, A6, A3, A2 (doubles) and A0 (extended)
// 1376          constant returned (with the sign of x) on the saturation path
// 1392 .. 1631  A14..A0 of Pol14 for 3.25 <= |x| < 4.0
// 1632          2.0/sqrt(Pi), used on the denormal path
LOCAL_OBJECT_START(erf_data)
// Coefficients ##0..15
// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19
data8 0x90AD48C0118FA10C, 0x00003FD7 //A18
data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17
data8 0x8DAB171246CC2B89, 0x00003FDC //A16
data8 0xC0B1D6662F8A7564, 0x00003FDF //A15
data8 0xA46374AC35099BAF, 0x0000BFE1 //A14
data8 0xB2F230996346EF27, 0x0000BFE4 //A13
data8 0xCDEC50950FACE04A, 0x00003FE6 //A12
data8 0x826014649396E9D2, 0x00003FE9 //A11
data8 0xCDB787DC718B13F9, 0x0000BFEB //A10
data8 0x8E0B23C24EE0C8EE, 0x0000BFED //A9
data8 0xA49EA40A4E5A3F76, 0x00003FF0 //A8
data8 0xB11E30BE912617D3, 0x00003FF0 //A7
data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6
data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5
data8 0xBB793EF404C09A22, 0x00003FF8 //A4
// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19
data8 0x8A0FD46092F95D44, 0x0000BFEA //A18
data8 0xA37B3242B7809E12, 0x00003FEC //A17
data8 0xA0330A5CD2E91689, 0x0000BFED //A16
data8 0x8E34A678F3497D17, 0x0000BFEC //A15
data8 0xAC185D45A2772384, 0x00003FEF //A14
data8 0xB0C11347CE7EEDE8, 0x00003FEF //A13
data8 0xD3330DC14EA0E4EB, 0x0000BFF2 //A12
data8 0xB4A6DFDE578A428F, 0x00003FF1 //A11
data8 0xA0B4034310D2D9CB, 0x00003FF5 //A10
data8 0xF71662D3132B7759, 0x0000BFF5 //A9
data8 0x9C88BF157695E9EC, 0x0000BFF7 //A8
data8 0xF84B80EFCA43895D, 0x00003FF8 //A7
data8 0x9722D22DA628A17B, 0x00003FF7 //A6
data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5
data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4
// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19
data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18
data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17
data8 0xB99A33294D32429D, 0x0000BFFB //A16
data8 0x8109D9C7197BC7C9, 0x00003FFB //A15
data8 0xC30DE8E2EFC2D760, 0x00003FFA //A14
data8 0x80DDA28C5B35DC73, 0x0000BFFC //A13
data8 0x9BE4DE5095BACE0D, 0x00003FF9 //A12
data8 0xDA4092509EE7D111, 0x00003FFC //A11
data8 0x89D98C561B0C9040, 0x0000BFFD //A10
data8 0xD20B26EB2F0881D4, 0x0000BFF9 //A9
data8 0xD089C56948731561, 0x00003FFD //A8
data8 0xDD704DEFFB21B7E7, 0x0000BFFD //A7
data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6
data8 0xD673A02CB5766633, 0x00003FFD //A5
data8 0x8D162CBAD8A12649, 0x0000BFFE //A4
// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19
data8 0xF76BE1935675D5C8, 0x00003FFE //A18
data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17
data8 0x8BE8F573D348DDA4, 0x00004000 //A16
data8 0x81E91923A1030502, 0x0000BFFF //A15
data8 0xCE7FE87B26CFD286, 0x0000BFFE //A14
data8 0x84EF6B4E17404384, 0x00004000 //A13
data8 0x91FEF33015404991, 0x0000C000 //A12
data8 0xDEDF6A9370747E56, 0x00003FFF //A11
data8 0x8397E6FF56CDFD9D, 0x0000BFFF //A10
data8 0xFAD1CE912473937B, 0x00003FFD //A9
data8 0xC48C1EA8AAA624EA, 0x0000BFFC //A8
data8 0xFECAF0097ACF981B, 0x00003FFA //A7
data8 0x8829A394065E4B95, 0x0000BFF9 //A6
data8 0xED3003E477A53EE7, 0x00003FF6 //A5
data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4
//
// Coefficients ##16..19
// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
data8 0x95FA98C337005D13, 0x0000BFF9 //A3
data8 0xE0F7E524D2808A97, 0x0000BFFB //A2
data8 0xE0F7E524D2808A98, 0x00003FFD //A1
data8 0x853F7AE0C76E915F, 0x00003FFE //A0
// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3
data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2
data8 0xD488F84B7DE12E9C, 0x00003FFD //A1
data8 0xD7BB3D3A08445636, 0x00003FFE //A0
// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
data8 0xC58571D23D5C4B3A, 0x00003FFD //A3
data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2
data8 0xA94DCF467CD10A16, 0x00003FFA //A1
data8 0xFECD70A13CAF1997, 0x00003FFE //A0
// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3
data8 0x8858A465CE594BD1, 0x0000BFEE //A2
data8 0x8858A447456DE61D, 0x00003FEA //A1
data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0
// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5
// (stored in load order, not in coefficient order)
data8 0xBE839EDBB36C7FCE //A9
data8 0x3EBB7745A18DD242 //A8
data8 0xBF4C02DB238F2AFC //A5
data8 0x3F7565BCD0A9A3EA //A4
data8 0xC093A3581BCF3333, 0x0000BFFD //A1
data8 0xBEEF4BB82AD8AE22 //A7
data8 0x3F1F9A2A57A218CD //A6
data8 0xBF9B82CE3127F4E4 //A3
data8 0x3FBCE2F21A042B25 //A2
data8 0x906EBA8214DB688D, 0x00003FFF //A0
// 1.0 - 2^(-63)
data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0
data8 0x95E91576C7A12250, 0x00003FE7 //A14
data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13
data8 0xED761DAFAF814DE9, 0x00003FEB //A12
data8 0xB3A77D921D0ACFC7, 0x0000BFEC //A11
data8 0xA662D27096B08D7C, 0x0000BFEC //A10
data8 0xDA0F410AE6233EA5, 0x00003FEF //A9
data8 0xAB4A8B16B3124327, 0x0000BFF1 //A8
data8 0xB241E236A5EDCED3, 0x00003FF2 //A7
data8 0x8A2A65BA1F551F77, 0x0000BFF3 //A6
data8 0xA4852D0B1D87000A, 0x00003FF3 //A5
data8 0x963EB00039489476, 0x0000BFF3 //A4
data8 0xCD5244FF4F7313A5, 0x00003FF2 //A3
data8 0xC6F1E695363BCB26, 0x0000BFF1 //A2
data8 0xF4DAF4680DA54C02, 0x00003FEF //A1
data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0
// A = 2.0/sqrt(Pi)
data8 0x906EBA8214DB688D, 0x00003FFF
LOCAL_OBJECT_END(erf_data)
.section .text
// double erf(double x): the argument arrives in f8 and the result is
// returned in f8.  The entry code classifies x and dispatches:
//   x denormal          -> erf_denormal
//   x NaN or +/-0       -> return x*1.0 + x
//   |x| < 0.5           -> erf_near_zero
//   x = +/-inf          -> return 1.0 with the sign of x
//   saturation range    -> erf_saturation
//   3.25 < |x| < 4.0    -> erf_3q_4
//   otherwise           -> Pol19(y), evaluated at the end of this block
GLOBAL_LIBM_ENTRY(erf)
{ .mfi
alloc r32 = ar.pfs, 0, 17, 0, 0
// significand of x combined with the sign and exponent of 1.0
fmerge.se fArgAbsNorm = f1, f8 // normalized x
adds rSignBit = 0x1, r0
}
{ .mfi
addl rDataPtr = @ltoff(erf_data), gp
fma.s1 fArgSqr = f8, f8, f0 // x^2
addl rThreeAndQ = 0x400A0, r0 // shifted bits of 3.25
}
;;
{ .mfi
getf.d rArg = f8 // x in GR
fclass.m p6,p0 = f8, 0x0b // is x denormal ?
shl rThreeAndQ = rThreeAndQ, 44 // bits of 3.25
}
{ .mfi
ld8 rDataPtr = [rDataPtr]
nop.f 0
addl rBiasedExpOf4 = 0x40100, r0 // shifted bits of 4.0
}
;;
{ .mfi
addl rSaturation = 0x4017A, r0 // shifted bits of 5.90625
fclass.m p7,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
shl rSignBit = rSignBit, 63 // mask for sign bit
}
{ .mfi
addl rMask = 0x7FF00, r0 // Mask for index bits
nop.f 0
addl rBias = 0x3FE00, r0 // bias of 0.5 << 8
}
;;
{ .mfi
setf.d fThreeAndQ = rThreeAndQ // 3.25 in FP register
fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
// keep sign, exponent and the top 8 significand bits of x
shr.u rShiftedArg = rArg, 44
}
{ .mfb
andcm rAbsArg = rArg, rSignBit // |x| in GR
nop.f 0
(p6) br.cond.spnt erf_denormal // branch out if x is denormal
}
;;
{ .mfi
and rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
fmerge.s fArgAbs = f1, f8 // |x|
shr rShiftedAbsArg = rAbsArg, 44
}
{ .mfb
// strict integer compare of the bit patterns, so |x| = 3.25 leaves p8 = 0
cmp.lt p8, p11 = rThreeAndQ, rAbsArg // p8 = 1 if |x| > 3.25
(p7) fma.d.s0 f8 = f8,f1,f8 // NaN or +/-0
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
}
;;
{ .mfi
// rIndex = 256 * (number of binades above 0.5) = byte offset of the
// 16-entry table (coefficients ##0..15) for the subrange containing |x|
sub rIndex = rShiftedArgMasked, rBias // index << 8
nop.f 0
cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5
}
{ .mfb
// p8 = 1 if 3.25 < |x| < 4.0
(p8) cmp.lt p8, p11 = rShiftedAbsArg, rBiasedExpOf4
// y = (significand of x) - 1.0, the argument of Pol19
fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1
(p10) br.cond.spnt erf_near_zero // branch out if |x| < 0.5
}
;;
.pred.rel "mutex", p8, p11
{ .mfi
(p8) adds rCoeffAddr1 = 1392, rDataPtr // coeff. for 3.25 <=|x|<4.0
(p9) fmerge.s f8 = f8,f1 // +/- inf
nop.i 0
}
{ .mfb
(p11) add rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
nop.f 0
(p9) br.ret.spnt b0 // exit for x = +/- inf
}
;;
{ .mfi
adds rCoeffAddr2 = 16, rCoeffAddr1
fmerge.s fSignumX = f8, f1 // signum(x)
nop.i 0
}
{ .mfb
// p12 = 1 if (bits of |x| >> 44) > 0x4017A.
// NOTE(review): because the compare is strict on the truncated bits,
// this triggers for |x| >= 5.921875; arguments in [5.90625, 5.921875)
// continue on the polynomial path - confirm this is intended.
cmp.lt p12, p0 = rSaturation, rShiftedAbsArg // |x| > 5.90625?
nop.f 0
(p12) br.cond.spnt erf_saturation // branch out to the saturation path
}
;;
// Here if paths #3,4
// if path #4 we'll branch out after loading of 14 necessary coefficients
// rCoeffAddr1 and rCoeffAddr2 start 16 bytes apart and both advance by 32,
// so together they read consecutive 16-byte table entries.
{.mfi
ldfe fA19 = [rCoeffAddr1], 32
nop.f 0
nop.i 0
}
{.mfi
ldfe fA18 = [rCoeffAddr2], 32
nop.f 0
adds rCoeffAddr3 = 1024, rDataPtr
}
;;
{.mfi
ldfe fA17 = [rCoeffAddr1], 32
nop.f 0
nop.i 0
}
{.mfi
ldfe fA16 = [rCoeffAddr2], 32
nop.f 0
nop.i 0
}
;;
{.mfi
ldfe fA15 = [rCoeffAddr1], 32
fma.s1 fTSqr = fArgAbsNorm, fArgAbsNorm, f0
// 64 bytes (4 coefficients) per subrange in the ##16..19 area
shr.u rIndex = rIndex, 2
}
{.mfi
ldfe fA14 = [rCoeffAddr2], 32
nop.f 0
adds rCoeffAddr4 = 16, r0
}
;;
{.mfi
ldfe fA13 = [rCoeffAddr1], 32
nop.f 0
// address of coefficients ##16..19
add rCoeffAddr3 = rCoeffAddr3, rIndex
}
{.mfi
ldfe fA12 = [rCoeffAddr2], 32
nop.f 0
// p15 = 1 for negative x, p14 = 1 otherwise (signed compare of the bits)
cmp.lt p15, p14 = rArg, r0
}
;;
{.mfi
ldfe fA11 = [rCoeffAddr1], 32
nop.f 0
add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4
}
{.mfi
ldfe fA10 = [rCoeffAddr2], 32
nop.f 0
nop.i 0
}
;;
{.mfi
ldfe fA9 = [rCoeffAddr1], 32
nop.f 0
nop.i 0
}
{.mfi
ldfe fA8 = [rCoeffAddr2], 32
nop.f 0
nop.i 0
}
;;
{.mfi
ldfe fA7 = [rCoeffAddr1], 32
// |x| - 3.25: only the erf_3q_4 path reads this value
fms.s1 fArgAbs = fArgAbs, f1, fThreeAndQ
nop.i 0
}
{.mfb
ldfe fA6 = [rCoeffAddr2], 32
nop.f 0
(p8) br.cond.spnt erf_3q_4 // branch out if 3.25 < |x| < 4.0
}
;;
{.mfi
ldfe fA5 = [rCoeffAddr1], 32
fma.s1 fTDeg3 = fArgAbsNorm, fTSqr, f0
nop.i 0
}
{.mfi
ldfe fA4 = [rCoeffAddr2], 32
fma.s1 fTQuadr = fTSqr, fTSqr, f0
nop.i 0
}
;;
// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
// The code builds Q(y) = A19*y^18 + ... + A2*y + A1 from partial sums and
// finishes with sign(x)*(Q(y)*y + A0).
{.mfi
ldfe fA3 = [rCoeffAddr3], 32
fma.s1 fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0
nop.i 0
}
{.mfi
ldfe fA2 = [rCoeffAddr4], 32
nop.f 0
nop.i 0
}
;;
{.mfi
ldfe fA1 = [rCoeffAddr3], 32
fma.s1 fRes = fA19, fArgAbsNorm, fA18
nop.i 0
}
{.mfi
ldfe fA0 = [rCoeffAddr4], 32
nop.f 0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fArgAbsNorm, fA16
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fArgAbsNorm, fA14
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fTDeg7 = fTDeg3, fTQuadr, f0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA13 = fA13, fArgAbsNorm, fA12
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA11 = fA11, fArgAbsNorm, fA10
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fArgAbsNorm, fA8
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRes = fRes, fTSqr, fA17
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA7 = fA7, fArgAbsNorm, fA6
nop.i 0
}
;;
{ .mfi
nop.m 0
// A5*y only: A4 is folded into the A4..A1 group below, and the
// A7..A5 group is later multiplied by y^3 (fTDeg3)
fma.s1 fA5 = fA5, fArgAbsNorm, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fTSqr, fA13
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA4 = fA4, fArgAbsNorm, fA3
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA2 = fA2, fArgAbsNorm, fA1
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA11 = fA11, fTSqr, fA9
nop.i 0
}
;;
{ .mfi
nop.m 0
// A7*y^3 + A6*y^2 + A5*y
fma.s1 fA7 = fA7, fTSqr, fA5
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRes = fRes, fTQuadr, fA15
nop.i 0
}
;;
{ .mfi
nop.m 0
// A4*y^3 + A3*y^2 + A2*y + A1
fma.s1 fA4 = fA4, fTSqr, fA2
nop.i 0
}
;;
{ .mfi
nop.m 0
// A19*y^11 + ... + A8
fma.s1 fRes = fRes, fTQuadr, fA11
nop.i 0
}
;;
{ .mfi
nop.m 0
// A7*y^6 + ... + A1
fma.s1 fA4 = fA7, fTDeg3, fA4
nop.i 0
}
;;
{ .mfi
nop.m 0
// Q(y) = A19*y^18 + ... + A1
fma.s1 fRes = fRes, fTDeg7, fA4
nop.i 0
}
;;
{ .mfi
nop.m 0
// result for negative argument
(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA0
nop.i 0
}
{ .mfb
nop.m 0
// result for positive argument
(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA0
br.ret.sptk b0
}
// Here if 3.25 < |x| < 4.0
// Evaluates sign(x)*Pol14(t) with t = |x| - 3.25 (already in fArgAbs).
// On entry the 14 loads done before the branch have placed A14..A1 of the
// 3.25..4.0 table in fA19..fA6 (fA19 = A14, fA18 = A13, ..., fA6 = A1);
// rCoeffAddr1 points at the remaining entry A0, which is loaded into fA5.
// p14/p15 select the sign of the final combination.
.align 32
erf_3q_4:
.pred.rel "mutex", p14, p15
{ .mfi
ldfe fA5 = [rCoeffAddr1], 32
fma.s1 fTSqr = fArgAbs, fArgAbs, f0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fRes = fA19, fArgAbs, fA18
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA17 = fA17, fArgAbs, fA16
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fArgAbs, fA14
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA13 = fA13, fArgAbs, fA12
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA11 = fA11, fArgAbs, fA10
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fArgAbs, fA8
nop.i 0
}
{ .mfi
nop.m 0
// t * sign(x)
fma.s1 fArgAbsNormSgn = fArgAbs, fSignumX, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fTQuadr = fTSqr, fTSqr, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRes = fRes, fTSqr, fA17
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fTSqr, fA13
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA11 = fA11, fTSqr, fA9
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA7 = fA7, fArgAbs, fA6
nop.i 0
}
;;
{ .mfi
nop.m 0
// t^4 * t^2 = t^6 (the alias name notwithstanding)
fma.s1 fTDeg7 = fTQuadr, fTSqr, f0
nop.i 0
}
{ .mfi
nop.m 0
// table A14*t^7 + ... + A7
fma.s1 fRes = fRes, fTQuadr, fA15
nop.i 0
}
;;
{ .mfi
nop.m 0
// table A6*t^5 + ... + A1
fma.s1 fA11 = fA11, fTSqr, fA7
nop.i 0
}
;;
{ .mfi
nop.m 0
// table A14*t^13 + ... + A1
fma.s1 fRes = fRes, fTDeg7, fA11
nop.i 0
}
;;
{ .mfi
nop.m 0
// result for negative argument
(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA5
nop.i 0
}
{ .mfb
nop.m 0
// result for positive argument
(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA5
br.ret.sptk b0
}
;;
// Here if |x| < 0.5
// Computes x*Pol9(x^2) = x*(A9*x^18 + A8*x^16 + ... + A1*x^2 + A0).
// On entry fArgSqr = x^2 and rDataPtr = address of erf_data.
// The high part (A9..A4) is accumulated in fRes and multiplied by x^9;
// the low part (A3..A0) is accumulated in fA1 and multiplied by x.
.align 32
erf_near_zero:
{ .mfi
adds rCoeffAddr1 = 1280, rDataPtr // address of A9
fma.s1 fTSqr = fArgSqr, fArgSqr, f0 // x^4
nop.i 0
}
{ .mfi
adds rCoeffAddr2 = 1328, rDataPtr // address of A7
nop.f 0
nop.i 0
}
;;
// A9..A2 are stored as doubles and loaded two at a time; A1 and A0 are
// stored in extended format and loaded with ldfe.
{ .mfi
ldfpd fA9, fA8 = [rCoeffAddr1], 16
nop.f 0
nop.i 0
}
{ .mfi
ldfpd fA7, fA6 = [rCoeffAddr2], 16
nop.f 0
nop.i 0
}
;;
{ .mfi
ldfpd fA5, fA4 = [rCoeffAddr1], 16
nop.f 0
nop.i 0
}
{ .mfi
ldfpd fA3, fA2 = [rCoeffAddr2], 16
nop.f 0
nop.i 0
}
;;
{ .mfi
ldfe fA1 = [rCoeffAddr1]
nop.f 0
nop.i 0
}
{ .mfi
ldfe fA0 = [rCoeffAddr2]
nop.f 0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fTQuadr = fTSqr, fTSqr, f0 // x^8
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRes = fA9, fArgSqr, fA8
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA7 = fA7, fArgSqr, fA6
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA3 = fA3, fArgSqr, fA2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA5 = fA5, fArgSqr, fA4
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA1 = fA1, fArgSqr, fA0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fTQuadrSgn = fTQuadr, f8, f0 // x^9
nop.i 0
}
;;
{ .mfi
nop.m 0
// A9*x^6 + A8*x^4 + A7*x^2 + A6
fma.s1 fRes = fRes, fTSqr, fA7
nop.i 0
}
;;
{ .mfi
nop.m 0
// A3*x^6 + A2*x^4 + A1*x^2 + A0
fma.s1 fA1 = fA3, fTSqr, fA1
nop.i 0
}
;;
{ .mfi
nop.m 0
// A9*x^10 + ... + A5*x^2 + A4
fma.s1 fRes = fRes, fTSqr, fA5
nop.i 0
}
;;
{ .mfi
nop.m 0
// low part multiplied by x
fma.s1 fA1 = fA1, f8, f0
nop.i 0
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fRes, fTQuadrSgn, fA1 // x*Pol9(x^2)
br.ret.sptk b0 // Exit for |x| < 0.5
};;
// Here if 5.90625 <= |x| < +inf
// Returns the table constant at offset 1376 of erf_data (documented there
// as 1.0 - 2^(-63)) multiplied by fSignumX, rounded to double by fma.d.
// On entry rDataPtr = address of erf_data and fSignumX = signum(x).
.align 32
erf_saturation:
{ .mfi
adds rDataPtr = 1376, rDataPtr // address of the 1.0 - 2^(-63) constant
nop.f 0
nop.i 0
}
;;
{ .mfi
ldfe fA0 = [rDataPtr]
nop.f 0
nop.i 0
}
;;
{ .mfb
nop.m 0
fma.d.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0 - 2^(-63))
// Exit for 5.90625 <= |x| < +inf
br.ret.sptk b0 // Exit for 5.90625 <=|x|< +inf
}
;;
// Here if x is double precision denormal
// Returns A0*x + x^2 for a negative denormal and A0*x - x^2 for a positive
// one, where A0 = 2.0/sqrt(Pi) is the last entry of erf_data (offset 1632).
// On entry rDataPtr = address of erf_data and f8 = x.
.align 32
erf_denormal:
{ .mfi
adds rDataPtr = 1632, rDataPtr // address of A0
// p7 = 1 for a negative denormal, p8 = 1 otherwise
fclass.m p7,p8 = f8, 0x0a // is x -denormal ?
nop.i 0
}
;;
{ .mfi
ldfe fA0 = [rDataPtr] // A0
nop.f 0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fA0 = fA0,f8,f0 // A0*x
nop.i 0
}
;;
{ .mfi
nop.m 0
(p7) fma.d.s0 f8 = f8,f8,fA0 // -denormal
nop.i 0
}
{ .mfb
nop.m 0
(p8) fnma.d.s0 f8 = f8,f8,fA0 // +denormal
br.ret.sptk b0 // Exit for denormal
}
;;
GLOBAL_LIBM_END(erf)
libm_alias_double_other (erf, erf)