ARM-NEON intrinsics code paths now type-safe (#115)

2024-11-25 05:30:04 +00:00 · 2020-07-31 15:49:13 -07:00 · 2020-07-31 15:49:13 -07:00 · 9962628ef4
commit 9962628ef4
parent 404c59a7dd
6 changed files with 561 additions and 539 deletions
--- a/Inc/DirectXMath.h
+++ b/Inc/DirectXMath.h
@ -196,7 +196,7 @@

 #if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

-#if defined(__clang__)
+#if defined(__clang__) || defined(__GNUC__)
 #define XM_PREFETCH( a ) __builtin_prefetch(a)
 #elif defined(_MSC_VER)
 #define XM_PREFETCH( a ) __prefetch(a)
@ -380,9 +380,13 @@ namespace DirectX

        inline operator XMVECTOR() const noexcept { return v; }
        inline operator const float* () const noexcept { return f; }
-#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
 #endif
    };

@ -395,9 +399,13 @@ namespace DirectX
        };

        inline operator XMVECTOR() const noexcept { return v; }
-#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
 #endif
    };

@ -410,9 +418,13 @@ namespace DirectX
        };

        inline operator XMVECTOR() const noexcept { return v; }
-#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
 #endif
    };

@ -425,9 +437,13 @@ namespace DirectX
        };

        inline operator XMVECTOR() const noexcept { return v; }
-#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
 #endif
    };

@ -2166,7 +2182,7 @@ namespace DirectX
        // Convert DivExponent into 1.0f/(1<<DivExponent)
        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
        // Splat the scalar value (It's really a float)
-        vScale = vdupq_n_u32(uScale);
+        vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
        // Multiply by the reciprocal (Perform a right shift by DivExponent)
        vResult = vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
        return vResult;
--- a/Inc/DirectXMathConvert.inl
+++ b/Inc/DirectXMathConvert.inl
@ -39,7 +39,7 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
    return Result;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fScale = 1.0f / (float)(1U << DivExponent);
-    float32x4_t vResult = vcvtq_f32_s32(VInt);
+    float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
    return vmulq_n_f32(vResult, fScale);
 #else // _XM_SSE_INTRINSICS_
    // Convert to floats
@ -91,10 +91,10 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
    // Float to int conversion
    int32x4_t vResulti = vcvtq_s32_f32(vResult);
    // If there was positive overflow, set to 0x7FFFFFFF
-    vResult = vandq_u32(vOverflow, g_XMAbsMask);
-    vOverflow = vbicq_u32(vResulti, vOverflow);
-    vOverflow = vorrq_u32(vOverflow, vResult);
-    return vOverflow;
+    vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
+    vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
 #else // _XM_SSE_INTRINSICS_
    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
    vResult = _mm_mul_ps(vResult, VFloat);
@ -129,7 +129,7 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
    return Result;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fScale = 1.0f / (float)(1U << DivExponent);
-    float32x4_t vResult = vcvtq_f32_u32(VUInt);
+    float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
    return vmulq_n_f32(vResult, fScale);
 #else // _XM_SSE_INTRINSICS_
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
@ -191,9 +191,9 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
    // Float to int conversion
    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
    // If there was overflow, set to 0xFFFFFFFFU
-    vResult = vbicq_u32(vResulti, vOverflow);
-    vOverflow = vorrq_u32(vOverflow, vResult);
-    return vOverflow;
+    vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
 #else // _XM_SSE_INTRINSICS_
    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
    vResult = _mm_mul_ps(vResult, VFloat);
@ -240,7 +240,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept
    return V;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t zero = vdupq_n_u32(0);
-    return vld1q_lane_u32(pSource, zero, 0);
+    return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0));
 #elif defined(_XM_SSE_INTRINSICS_)
    return _mm_load_ss(reinterpret_cast<const float*>(pSource));
 #endif
@ -281,7 +281,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t x = vld1_u32(pSource);
    uint32x2_t zero = vdup_n_u32(0);
-    return vcombine_u32(x, zero);
+    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
 #elif defined(_XM_SSE_INTRINSICS_)
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
 #endif
@ -307,7 +307,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept
    uint32x2_t x = vld1_u32(pSource);
 #endif
    uint32x2_t zero = vdup_n_u32(0);
-    return vcombine_u32(x, zero);
+    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
 #elif defined(_XM_SSE_INTRINSICS_)
    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
 #endif
@ -434,7 +434,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept
    uint32x2_t x = vld1_u32(pSource);
    uint32x2_t zero = vdup_n_u32(0);
    uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0);
-    return vcombine_u32(x, y);
+    return vreinterpretq_f32_u32(vcombine_u32(x, y));
 #elif defined(_XM_SSE4_INTRINSICS_)
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
@ -466,7 +466,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept
 #else
    uint32x4_t V = vld1q_u32(pSource);
 #endif
-    return vsetq_lane_u32(0, V, 3);
+    return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3));
 #elif defined(_XM_SSE4_INTRINSICS_)
    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
@ -614,7 +614,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept
    V.vector4_u32[3] = pSource[3];
    return V;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_u32(pSource);
+    return vreinterpretq_f32_u32(vld1q_u32(pSource));
 #elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
    return _mm_castsi128_ps(V);
@ -638,7 +638,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept
 #ifdef _MSC_VER
    return vld1q_u32_ex(pSource, 128);
 #else
-    return vld1q_u32(pSource);
+    return vreinterpretq_f32_u32(vld1q_u32(pSource));
 #endif
 #elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_load_si128(reinterpret_cast<const __m128i*>(pSource));
@ -780,8 +780,8 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept
    float32x4_t T = vextq_f32(v0, v1, 3);

    XMMATRIX M;
-    M.r[0] = vandq_u32(v0, g_XMMask3);
-    M.r[1] = vandq_u32(T, g_XMMask3);
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3));
    M.r[2] = vcombine_f32(vget_high_f32(v1), v2);
    M.r[3] = g_XMIdentityR3;
    return M;
@ -846,9 +846,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
    float32x4_t T3 = vextq_f32(v2, v2, 1);

    XMMATRIX M;
-    M.r[0] = vandq_u32(v0, g_XMMask3);
-    M.r[1] = vandq_u32(T1, g_XMMask3);
-    M.r[2] = vandq_u32(T2, g_XMMask3);
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
@ -930,9 +930,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
    float32x4_t T3 = vextq_f32(v2, v2, 1);

    XMMATRIX M;
-    M.r[0] = vandq_u32(v0, g_XMMask3);
-    M.r[1] = vandq_u32(T1, g_XMMask3);
-    M.r[2] = vandq_u32(T2, g_XMMask3);
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
@ -1012,9 +1012,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept
    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);

    XMMATRIX M = {};
-    M.r[0] = vandq_u32(T0, g_XMMask3);
-    M.r[1] = vandq_u32(T1, g_XMMask3);
-    M.r[2] = vandq_u32(T2, g_XMMask3);
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
@ -1096,9 +1096,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept
    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);

    XMMATRIX M = {};
-    M.r[0] = vandq_u32(T0, g_XMMask3);
-    M.r[1] = vandq_u32(T1, g_XMMask3);
-    M.r[2] = vandq_u32(T2, g_XMMask3);
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
@ -1283,7 +1283,7 @@ inline void XM_CALLCONV XMStoreInt2
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
    vst1_u32(pDestination, VL);
 #elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
@ -1304,7 +1304,7 @@ inline void XM_CALLCONV XMStoreInt2A
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
 #ifdef _MSC_VER
    vst1_u32_ex(pDestination, VL, 64);
 #else
@ -1373,9 +1373,9 @@ inline void XM_CALLCONV XMStoreSInt2
    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x2_t v = vget_low_s32(V);
-    v = vcvt_s32_f32(v);
-    vst1_s32(reinterpret_cast<int32_t*>(pDestination), v);
+    float32x2_t v = vget_low_f32(V);
+    int32x2_t iv = vcvt_s32_f32(v);
+    vst1_s32(reinterpret_cast<int32_t*>(pDestination), iv);
 #elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
@ -1443,7 +1443,7 @@ inline void XM_CALLCONV XMStoreInt3
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
    vst1_u32(pDestination, VL);
    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
 #elif defined(_XM_SSE_INTRINSICS_)
@ -1468,7 +1468,7 @@ inline void XM_CALLCONV XMStoreInt3A
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
 #ifdef _MSC_VER
    vst1_u32_ex(pDestination, VL, 64);
 #else
@ -1634,7 +1634,7 @@ inline void XM_CALLCONV XMStoreInt4
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_u32(pDestination, V);
+    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
 #elif defined(_XM_SSE_INTRINSICS_)
    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
 #endif
@ -1659,7 +1659,7 @@ inline void XM_CALLCONV XMStoreInt4A
 #ifdef _MSC_VER
    vst1q_u32_ex(pDestination, V, 128);
 #else
-    vst1q_u32(pDestination, V);
+    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
 #endif
 #elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
--- a/Inc/DirectXMathMatrix.inl
+++ b/Inc/DirectXMathMatrix.inl
@ -48,23 +48,25 @@ inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept
    return (i != 0);      // i == 0 if nothing matched
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load in registers
-    XMVECTOR vX = M.r[0];
-    XMVECTOR vY = M.r[1];
-    XMVECTOR vZ = M.r[2];
-    XMVECTOR vW = M.r[3];
+    float32x4_t vX = M.r[0];
+    float32x4_t vY = M.r[1];
+    float32x4_t vZ = M.r[2];
+    float32x4_t vW = M.r[3];
    // Test themselves to check for NaN
-    vX = vmvnq_u32(vceqq_f32(vX, vX));
-    vY = vmvnq_u32(vceqq_f32(vY, vY));
-    vZ = vmvnq_u32(vceqq_f32(vZ, vZ));
-    vW = vmvnq_u32(vceqq_f32(vW, vW));
+    uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX));
+    uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY));
+    uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ));
+    uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW));
    // Or all the results
-    vX = vorrq_u32(vX, vZ);
-    vY = vorrq_u32(vY, vW);
-    vX = vorrq_u32(vX, vY);
+    xmask = vorrq_u32(xmask, zmask);
+    ymask = vorrq_u32(ymask, wmask);
+    xmask = vorrq_u32(xmask, ymask);
    // If any tested true, return true
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX));
-    uint16x4x2_t vTemp2 = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    uint32_t r = vget_lane_u32(vTemp2.val[1], 1);
+    uint8x8x2_t vTemp = vzip_u8(
+        vget_low_u8(vreinterpretq_u8_u32(xmask)),
+        vget_high_u8(vreinterpretq_u8_u32(xmask)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
    return (r != 0);
 #elif defined(_XM_SSE_INTRINSICS_)
    // Load in registers
@ -113,24 +115,31 @@ inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept
    } while (--i);
    return (i != 0);      // i == 0 if nothing matched
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Load in registers
+    float32x4_t vX = M.r[0];
+    float32x4_t vY = M.r[1];
+    float32x4_t vZ = M.r[2];
+    float32x4_t vW = M.r[3];
    // Mask off the sign bits
-    XMVECTOR vTemp1 = vandq_u32(M.r[0], g_XMAbsMask);
-    XMVECTOR vTemp2 = vandq_u32(M.r[1], g_XMAbsMask);
-    XMVECTOR vTemp3 = vandq_u32(M.r[2], g_XMAbsMask);
-    XMVECTOR vTemp4 = vandq_u32(M.r[3], g_XMAbsMask);
+    vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask));
+    vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask));
+    vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask));
+    vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask));
    // Compare to infinity
-    vTemp1 = vceqq_f32(vTemp1, g_XMInfinity);
-    vTemp2 = vceqq_f32(vTemp2, g_XMInfinity);
-    vTemp3 = vceqq_f32(vTemp3, g_XMInfinity);
-    vTemp4 = vceqq_f32(vTemp4, g_XMInfinity);
+    uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity);
+    uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity);
+    uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity);
+    uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity);
    // Or the answers together
-    vTemp1 = vorrq_u32(vTemp1, vTemp2);
-    vTemp3 = vorrq_u32(vTemp3, vTemp4);
-    vTemp1 = vorrq_u32(vTemp1, vTemp3);
-    // If any are infinity, the signs are true.
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
-    uint16x4x2_t vTemp5 = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    uint32_t r = vget_lane_u32(vTemp5.val[1], 1);
+    xmask = vorrq_u32(xmask, zmask);
+    ymask = vorrq_u32(ymask, wmask);
+    xmask = vorrq_u32(xmask, ymask);
+    // If any tested true, return true
+    uint8x8x2_t vTemp = vzip_u8(
+        vget_low_u8(vreinterpretq_u8_u32(xmask)),
+        vget_high_u8(vreinterpretq_u8_u32(xmask)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
    return (r != 0);
 #elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bits
@ -187,16 +196,16 @@ inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept
    uOne |= uZero;
    return (uOne == 0);
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR vTemp1 = vceqq_f32(M.r[0], g_XMIdentityR0);
-    XMVECTOR vTemp2 = vceqq_f32(M.r[1], g_XMIdentityR1);
-    XMVECTOR vTemp3 = vceqq_f32(M.r[2], g_XMIdentityR2);
-    XMVECTOR vTemp4 = vceqq_f32(M.r[3], g_XMIdentityR3);
-    vTemp1 = vandq_u32(vTemp1, vTemp2);
-    vTemp3 = vandq_u32(vTemp3, vTemp4);
-    vTemp1 = vandq_u32(vTemp1, vTemp3);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
-    uint16x4x2_t vTemp5 = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    uint32_t r = vget_lane_u32(vTemp5.val[1], 1);
+    uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0);
+    uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1);
+    uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2);
+    uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3);
+    xmask = vandq_u32(xmask, zmask);
+    ymask = vandq_u32(ymask, wmask);
+    xmask = vandq_u32(xmask, ymask);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
    return (r == 0xFFFFFFFFU);
 #elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0);
@ -265,10 +274,10 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply
    float32x2_t VL = vget_low_f32(M1.r[0]);
    float32x2_t VH = vget_high_f32(M1.r[0]);
    // Perform the operation on the first row
-    XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
+    float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
+    float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
+    float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
+    float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    mResult.r[0] = vaddq_f32(vZ, vW);
    // Repeat for the other 3 rows
    VL = vget_low_f32(M1.r[1]);
@ -478,10 +487,10 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose
    float32x2_t VL = vget_low_f32(M1.r[0]);
    float32x2_t VH = vget_high_f32(M1.r[0]);
    // Perform the operation on the first row
-    XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
+    float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
+    float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
+    float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
+    float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    float32x4_t r0 = vaddq_f32(vZ, vW);
    // Repeat for the other 3 rows
    VL = vget_low_f32(M1.r[1]);
@ -1403,9 +1412,9 @@ inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept

 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMMATRIX M;
-    M.r[0] = vandq_u32(Scale, g_XMMaskX);
-    M.r[1] = vandq_u32(Scale, g_XMMaskY);
-    M.r[2] = vandq_u32(Scale, g_XMMaskZ);
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ));
    M.r[3] = g_XMIdentityR3.v;
    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
@ -1455,12 +1464,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept
    float    fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

-    XMVECTOR T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
+    float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
    T1 = vsetq_lane_f32(fSinAngle, T1, 2);

-    XMVECTOR T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
+    float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
    T2 = vsetq_lane_f32(fCosAngle, T2, 2);

    XMMATRIX M;
@ -1528,12 +1537,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept
    float    fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

-    XMVECTOR T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
+    float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
    T0 = vsetq_lane_f32(-fSinAngle, T0, 2);

-    XMVECTOR T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
+    float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
    T2 = vsetq_lane_f32(fCosAngle, T2, 2);

    XMMATRIX M;
@ -1601,12 +1610,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept
    float    fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

-    XMVECTOR T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
+    float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
    T0 = vsetq_lane_f32(fSinAngle, T0, 1);

-    XMVECTOR T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
+    float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
    T1 = vsetq_lane_f32(fCosAngle, T1, 1);

    XMMATRIX M;
@ -2166,7 +2175,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (FarZ - NearZ);
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1);
@ -2253,7 +2262,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (NearZ - FarZ);
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
@ -2351,7 +2360,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH
    float fRange = FarZ / (FarZ - NearZ);
    float Height = CosFov / SinFov;
    float Width = Height / AspectRatio;
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(Width, Zero, 0);
@ -2452,7 +2461,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH
    float fRange = FarZ / (NearZ - FarZ);
    float Height = CosFov / SinFov;
    float Width = Height / AspectRatio;
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(Width, Zero, 0);
@ -2549,7 +2558,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = FarZ / (FarZ - NearZ);
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
@ -2647,7 +2656,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = FarZ / (NearZ - FarZ);
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);

    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
@ -2737,7 +2746,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fRange = 1.0f / (FarZ - NearZ);

-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
@ -2821,7 +2830,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fRange = 1.0f / (NearZ - FarZ);

-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
@ -2910,7 +2919,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (FarZ - NearZ);
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
@ -3010,7 +3019,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (NearZ - FarZ);
-    const XMVECTOR Zero = vdupq_n_f32(0);
+    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
@ -3178,7 +3187,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
    R0 = vmul_f32(S0, R0);
    S0 = vrecps_f32(R0, vS);
    R0 = vmul_f32(S0, R0);
-    float32x4_t Reciprocal = vcombine_u32(R0, R0);
+    float32x4_t Reciprocal = vcombine_f32(R0, R0);
    r[0] = vmulq_f32(r[0], Reciprocal);
    r[1] = vmulq_f32(r[1], Reciprocal);
    r[2] = vmulq_f32(r[2], Reciprocal);
@ -3266,7 +3275,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept
    R0 = vmul_f32(S0, R0);
    S0 = vrecps_f32(R0, vS);
    R0 = vmul_f32(S0, R0);
-    float32x4_t Reciprocal = vcombine_u32(R0, R0);
+    float32x4_t Reciprocal = vcombine_f32(R0, R0);
    XMMATRIX R;
    R.r[0] = vmulq_f32(r[0], Reciprocal);
    R.r[1] = vmulq_f32(r[1], Reciprocal);
--- a/Inc/DirectXMathMisc.inl
+++ b/Inc/DirectXMathMisc.inl
@ -120,12 +120,12 @@ inline XMVECTOR XM_CALLCONV XMQuaternionMultiply
    vResult = vmlaq_f32(vResult, Q2X, ControlWZYX);

    // Mul by Q1ZWXY
-    vTemp = vrev64q_u32(vTemp);
+    vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
    Q2Y = vmulq_f32(Q2Y, vTemp);
    vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);

    // Mul by Q1YXWZ
-    vTemp = vrev64q_u32(vTemp);
+    vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
    Q2Z = vmulq_f32(Q2Z, vTemp);
    vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
@ -728,74 +728,74 @@ inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
    static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
    static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };

-    XMVECTOR r0 = M.r[0];
-    XMVECTOR r1 = M.r[1];
-    XMVECTOR r2 = M.r[2];
+    float32x4_t r0 = M.r[0];
+    float32x4_t r1 = M.r[1];
+    float32x4_t r2 = M.r[2];

-    XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
-    XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
-    XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
+    float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
+    float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
+    float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0);

    // x^2 >= y^2 equivalent to r11 - r00 <= 0
-    XMVECTOR r11mr00 = vsubq_f32(r11, r00);
-    XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero);
+    float32x4_t r11mr00 = vsubq_f32(r11, r00);
+    uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero);

    // z^2 >= w^2 equivalent to r11 + r00 <= 0
-    XMVECTOR r11pr00 = vaddq_f32(r11, r00);
-    XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero);
+    float32x4_t r11pr00 = vaddq_f32(r11, r00);
+    uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero);

    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
-    XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
+    uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);

    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
-    XMVECTOR t0 = vmulq_f32(XMPMMP, r00);
-    XMVECTOR x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
+    float32x4_t t0 = vmulq_f32(XMPMMP, r00);
+    float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
    x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22);
    x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne);

    // (r01, r02, r12, r11)
    t0 = vextq_f32(r0, r0, 1);
-    XMVECTOR t1 = vextq_f32(r1, r1, 1);
+    float32x4_t t1 = vextq_f32(r1, r1, 1);
    t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1)));

    // (r10, r20, r21, r10)
    t1 = vextq_f32(r2, r2, 3);
-    XMVECTOR r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
+    float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
    t1 = vbslq_f32(Select0110, t1, r10);

    // (4*x*y, 4*x*z, 4*y*z, unused)
-    XMVECTOR xyxzyz = vaddq_f32(t0, t1);
+    float32x4_t xyxzyz = vaddq_f32(t0, t1);

    // (r21, r20, r10, r10)
    t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10));

    // (r12, r02, r01, r12)
-    XMVECTOR t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
-    XMVECTOR t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
+    float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
+    float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
    t1 = vbslq_f32(Select0110, t2, t3);

    // (4*x*w, 4*y*w, 4*z*w, unused)
-    XMVECTOR xwywzw = vsubq_f32(t0, t1);
+    float32x4_t xwywzw = vsubq_f32(t0, t1);
    xwywzw = vmulq_f32(XMMPMP, xwywzw);

    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
    t0 = vextq_f32(xyxzyz, xyxzyz, 3);
    t1 = vbslq_f32(Select0110, t0, x2y2z2w2);
    t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0);
-    XMVECTOR tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
+    float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);

    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
    t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2);
    t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1);
-    XMVECTOR tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
+    float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);

    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
    t0 = vextq_f32(xyxzyz, xyxzyz, 1);
    t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw)));
-    XMVECTOR tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
+    float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);

    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
-    XMVECTOR tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
+    float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);

    // Select the row of the tensor-product matrix that has the largest
    // magnitude.
@ -1358,8 +1358,8 @@ inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept
        } } };
    return vResult.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR vTemp = veorq_u32(vColor, g_XMNegate3);
-    return vaddq_f32(vTemp, g_XMOne3);
+    uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3);
+    return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3);
 #elif defined(_XM_SSE_INTRINSICS_)
    // Negate only x,y and z.
    XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3);
--- a/Inc/DirectXMathVector.inl
+++ b/Inc/DirectXMathVector.inl
--- a/Inc/DirectXPackedVector.inl
+++ b/Inc/DirectXPackedVector.inl
@ -23,7 +23,7 @@ inline float XMConvertHalfToFloat(HALF Value) noexcept
    __m128i V1 = _mm_cvtsi32_si128(static_cast<int>(Value));
    __m128 V2 = _mm_cvtph_ps(V1);
    return _mm_cvtss_f32(V2);
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
    uint16x4_t vHalf = vdup_n_u16(Value);
    float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
    return vgetq_lane_f32(vFloat, 0);
@ -255,7 +255,7 @@ inline float* XMConvertHalfToFloatStream
    XM_SFENCE();

    return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);

@ -389,7 +389,7 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
    __m128 V1 = _mm_set_ss(Value);
    __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
    return static_cast<HALF>(_mm_extract_epi16(V2, 0));
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
    float32x4_t vFloat = vdupq_n_f32(Value);
    float16x4_t vHalf = vcvt_f16_f32(vFloat);
    return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
@ -609,7 +609,7 @@ inline HALF* XMConvertFloatToHalfStream
    }

    return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
    auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);

@ -1091,9 +1091,9 @@ inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept
    return vResult.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
+    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8));
    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
-    vInt = vandq_s32(vInt, g_XMMaskXY);
+    vInt = vandq_u32(vInt, g_XMMaskXY);
    return vcvtq_f32_u32(vInt);
 #elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } };
@ -1317,9 +1317,9 @@ inline XMVECTOR XM_CALLCONV XMLoadShortN4(const XMSHORTN4* pSource) noexcept
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    int16x4_t vInt = vld1_s16(reinterpret_cast<const int16_t*>(pSource));
    int32x4_t V = vmovl_s16(vInt);
-    V = vcvtq_f32_s32(V);
-    V = vmulq_n_f32(V, 1.0f / 32767.0f);
-    return vmaxq_f32(V, vdupq_n_f32(-1.f));
+    float32x4_t vResult = vcvtq_f32_s32(V);
+    vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f);
+    return vmaxq_f32(vResult, vdupq_n_f32(-1.f));
 #elif defined(_XM_SSE_INTRINSICS_)
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
@ -1391,8 +1391,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint16x4_t vInt = vld1_u16(reinterpret_cast<const uint16_t*>(pSource));
    uint32x4_t V = vmovl_u16(vInt);
-    V = vcvtq_f32_u32(V);
-    return vmulq_n_f32(V, 1.0f / 65535.0f);
+    float32x4_t vResult = vcvtq_f32_u32(V);
+    return vmulq_n_f32(vResult, 1.0f / 65535.0f);
 #elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 1.0f / (65535.0f * 65536.0f) } } };
    static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } };
@ -1626,7 +1626,7 @@ inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept
    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
    vInt = vandq_u32(vInt, g_XMMaskDec4);
    int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias);
-    vTemp = veorq_u32(vTemp, g_XMFlipW);
+    vTemp = veorq_s32(vTemp, g_XMFlipW);
    float32x4_t R = vcvtq_f32_s32(vTemp);
    R = vaddq_f32(R, g_XMAddUDec4);
    return vmulq_f32(R, XRMul);
@ -2686,8 +2686,7 @@ inline void XM_CALLCONV XMStoreShortN4
    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
    vResult = vmulq_n_f32(vResult, 32767.0f);
-    vResult = vcvtq_s32_f32(vResult);
-    int16x4_t vInt = vmovn_s32(vResult);
+    int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
    vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
 #elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
@ -2724,8 +2723,7 @@ inline void XM_CALLCONV XMStoreShort4
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t vResult = vmaxq_f32(V, g_ShortMin);
    vResult = vminq_f32(vResult, g_ShortMax);
-    vResult = vcvtq_s32_f32(vResult);
-    int16x4_t vInt = vmovn_s32(vResult);
+    int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
    vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
 #elif defined(_XM_SSE_INTRINSICS_)
    // Bounds check
@ -2767,8 +2765,7 @@ inline void XM_CALLCONV XMStoreUShortN4
    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
    vResult = vmulq_n_f32(vResult, 65535.0f);
    vResult = vaddq_f32(vResult, g_XMOneHalf);
-    vResult = vcvtq_u32_f32(vResult);
-    uint16x4_t vInt = vmovn_u32(vResult);
+    uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
    vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
 #elif defined(_XM_SSE_INTRINSICS_)
    // Bounds check
@ -2812,8 +2809,7 @@ inline void XM_CALLCONV XMStoreUShort4
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
    vResult = vminq_f32(vResult, g_UShortMax);
-    vResult = vcvtq_u32_f32(vResult);
-    uint16x4_t vInt = vmovn_u32(vResult);
+    uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
    vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
 #elif defined(_XM_SSE_INTRINSICS_)
    // Bounds check
@ -2947,7 +2943,7 @@ inline void XM_CALLCONV XMStoreXDec4
    vTemp = vorr_u32(vTemp, vTemp2);
    // Perform a single bit left shift on y|w
    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_s32(vTemp2, vTemp2);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
    vTemp = vorr_u32(vTemp, vTemp2);
    vst1_lane_u32(&pDestination->v, vTemp, 0);
 #elif defined(_XM_SSE_INTRINSICS_)
@ -3640,7 +3636,7 @@ inline void XM_CALLCONV XMStoreU555
    vTemp = vorr_u32(vTemp, vTemp2);
    // Perform a single bit left shift on y|w
    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_s32(vTemp2, vTemp2);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
    vTemp = vorr_u32(vTemp, vTemp2);
    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
 #elif defined(_XM_SSE_INTRINSICS_)