Fixed float denorm conversion handling for XMConvertFloatToHalf (#114)

2024-09-19 14:49:54 +00:00 · 2020-06-25 15:08:32 -07:00 · 2020-06-25 15:08:32 -07:00 · d0bbddc9f2
commit d0bbddc9f2
parent 196104d0eb
2 changed files with 27 additions and 36 deletions
--- a/Inc/DirectXPackedVector.h
+++ b/Inc/DirectXPackedVector.h
@@ -71,7 +71,7 @@ namespace DirectX
        //------------------------------------------------------------------------------
        // 16 bit floating point number consisting of a sign bit, a 5 bit biased
        // exponent, and a 10 bit mantissa
-        typedef uint16_t HALF;
+        using HALF = uint16_t;

        //------------------------------------------------------------------------------
        // 2D Vector; 16 bit floating point components
--- a/Inc/DirectXPackedVector.inl
+++ b/Inc/DirectXPackedVector.inl
@@ -387,8 +387,8 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
 {
 #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    __m128 V1 = _mm_set_ss(Value);
-    __m128i V2 = _mm_cvtps_ph(V1, 0);
-    return static_cast<HALF>(_mm_cvtsi128_si32(V2));
+    __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
+    return static_cast<HALF>(_mm_extract_epi16(V2, 0));
 #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
    float32x4_t vFloat = vdupq_n_f32(Value);
    float16x4_t vHalf = vcvt_f16_f32(vFloat);
@@ -399,38 +399,29 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
    auto IValue = reinterpret_cast<uint32_t*>(&Value)[0];
    uint32_t Sign = (IValue & 0x80000000U) >> 16U;
    IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign
-
-    if (IValue > 0x477FE000U)
+    if (IValue >= 0x47800000 /*e+16*/)
    {
-        // The number is too large to be represented as a half.  Saturate to infinity.
-        if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF) != 0))
-        {
-            Result = 0x7FFF; // NAN
-        }
-        else
-        {
-            Result = 0x7C00U; // INF
-        }
+        // The number is too large to be represented as a half. Return infinity or NaN
+        Result = 0x7C00U | ((IValue > 0x7F800000) ? (0x200 | ((IValue >> 13U) & 0x3FFU)) : 0U);
    }
-    else if (!IValue)
+    else if (IValue <= 0x33000000U /*e-25*/)
    {
        Result = 0;
    }
+    else if (IValue < 0x38800000U /*e-14*/)
+    {
+        // The number is too small to be represented as a normalized half.
+        // Convert it to a denormalized value.
+        uint32_t Shift = 125U - (IValue >> 23U);
+        IValue = 0x800000U | (IValue & 0x7FFFFFU);
+        Result = IValue >> (Shift + 1);
+        uint32_t s = (IValue & ((1U << Shift) - 1)) != 0;
+        Result += (Result | s) & ((IValue >> Shift) & 1U);
+    }
    else
    {
-        if (IValue < 0x38800000U)
-        {
-            // The number is too small to be represented as a normalized half.
-            // Convert it to a denormalized value.
-            uint32_t Shift = 113U - (IValue >> 23U);
-            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
-        }
-        else
-        {
-            // Rebias the exponent to represent the value as a normalized half.
-            IValue += 0xC8000000U;
-        }
-
+        // Rebias the exponent to represent the value as a normalized half.
+        IValue += 0xC8000000U;
        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU;
    }
    return static_cast<HALF>(Result | Sign);
@@ -477,7 +468,7 @@ inline HALF* XMConvertFloatToHalfStream
                        __m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
                        pFloat += InputStride * 4;

-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);

                        _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
                        pHalf += OutputStride * 4;
@@ -492,7 +483,7 @@ inline HALF* XMConvertFloatToHalfStream
                        __m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
                        pFloat += InputStride * 4;

-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);

                        _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
                        pHalf += OutputStride * 4;
@@ -510,7 +501,7 @@ inline HALF* XMConvertFloatToHalfStream
                        __m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
                        pFloat += InputStride * 4;

-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);

                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
                        pHalf += OutputStride;
@@ -531,7 +522,7 @@ inline HALF* XMConvertFloatToHalfStream
                        __m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
                        pFloat += InputStride * 4;

-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);

                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
                        pHalf += OutputStride;
@@ -567,7 +558,7 @@ inline HALF* XMConvertFloatToHalfStream
                __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
                FV = _mm_blend_ps(FV, FT, 0xC);

-                __m128i HV = _mm_cvtps_ph(FV, 0);
+                __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);

                _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
                pHalf += OutputStride * 4;
@@ -595,7 +586,7 @@ inline HALF* XMConvertFloatToHalfStream
                __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
                FV = _mm_blend_ps(FV, FT, 0xC);

-                __m128i HV = _mm_cvtps_ph(FV, 0);
+                __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);

                *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
                pHalf += OutputStride;
@@ -2099,7 +2090,7 @@ inline void XM_CALLCONV XMStoreHalf2
 {
    assert(pDestination);
 #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph(V, 0);
+    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
    _mm_store_ss(reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1));
 #else
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
@@ -2655,7 +2646,7 @@ inline void XM_CALLCONV XMStoreHalf4
 {
    assert(pDestination);
 #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph(V, 0);
+    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDestination), V1);
 #else
    XMFLOAT4A t;