Mirror of https://github.com/microsoft/DirectXMath

ARMv8 optimizations

Chuck Walbourn 2016-11-09 15:51:02 -08:00
parent dc68173efe
commit d75f745b16
3 changed files with 319 additions and 0 deletions
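Every hunk in this commit follows one pattern: AArch64 NEON gains a true vector divide (and directed rounding), so the two-step vrecpe/vrecps Newton-Raphson approximation needed on AArch32 becomes the fallback path. A minimal standalone sketch of that dispatch, not part of the diff (the helper name DividePS and the dual _M_ARM64/__aarch64__ guard are assumptions; the library itself tests only _M_ARM64):

#include <arm_neon.h>

inline float32x4_t DividePS(float32x4_t num, float32x4_t den)
{
#if defined(_M_ARM64) || defined(__aarch64__)
    // ARMv8 AArch64 has a true vector divide: exact, IEEE-rounded.
    return vdivq_f32(num, den);
#else
    // AArch32 fallback: 2 iterations of Newton-Raphson refinement of 1/den.
    float32x4_t r = vrecpeq_f32(den);       // ~8-bit estimate of 1/den
    r = vmulq_f32(vrecpsq_f32(r, den), r);  // vrecps computes 2 - den*r
    r = vmulq_f32(vrecpsq_f32(r, den), r);  // second refinement
    return vmulq_f32(num, r);
#endif
}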

Inc/DirectXMathMatrix.inl

@@ -2996,6 +2996,13 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
r[3] = XMVectorDivide( r[3], vS );
return *this;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
float32x4_t vS = vdupq_n_f32( S );
r[0] = vdivq_f32( r[0], vS );
r[1] = vdivq_f32( r[1], vS );
r[2] = vdivq_f32( r[2], vS );
r[3] = vdivq_f32( r[3], vS );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x2_t vS = vdup_n_f32( S );
float32x2_t R0 = vrecpe_f32( vS );
@@ -3008,6 +3015,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
r[1] = vmulq_f32( r[1], Reciprocal );
r[2] = vmulq_f32( r[2], Reciprocal );
r[3] = vmulq_f32( r[3], Reciprocal );
#endif
return *this;
#elif defined(_XM_SSE_INTRINSICS_)
__m128 vS = _mm_set_ps1( S );
@@ -3075,6 +3083,14 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
R.r[3] = XMVectorDivide( r[3], vS );
return R;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
float32x4_t vS = vdupq_n_f32( S );
XMMATRIX R;
R.r[0] = vdivq_f32( r[0], vS );
R.r[1] = vdivq_f32( r[1], vS );
R.r[2] = vdivq_f32( r[2], vS );
R.r[3] = vdivq_f32( r[3], vS );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x2_t vS = vdup_n_f32( S );
float32x2_t R0 = vrecpe_f32( vS );
@@ -3088,6 +3104,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
R.r[1] = vmulq_f32( r[1], Reciprocal );
R.r[2] = vmulq_f32( r[2], Reciprocal );
R.r[3] = vmulq_f32( r[3], Reciprocal );
#endif
return R;
#elif defined(_XM_SSE_INTRINSICS_)
__m128 vS = _mm_set_ps1( S );
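A note on the fallback that recurs above: vrecpsq_f32(x, d) computes 2 - d*x, so each refinement step is

    x_{n+1} = x_n (2 - d x_n)

Writing x_n = (1 - e_n)/d gives x_{n+1} = (1 - e_n^2)/d, i.e. the relative error squares each step. The vrecpe estimate is good to roughly 8 bits (e_0 ~ 2^-8), so two iterations drive the error to about 2^-32, below a float's 24-bit significand. That is why exactly two iterations appear throughout, and why the new AArch64 vdivq_f32 path is both shorter and exactly rounded.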

Inc/DirectXMathVector.inl

@@ -2352,6 +2352,9 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndnq_f32(V);
#else
uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign );
float32x4_t R1 = vaddq_f32( V, sMagic );
@@ -2360,6 +2363,7 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
XMVECTOR vResult = vbslq_f32( mask, R1, V );
return vResult;
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
@@ -2412,6 +2416,9 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndq_f32(V);
#else
float32x4_t vTest = vabsq_f32( V );
vTest = vcltq_f32( vTest, g_XMNoFraction );
@@ -2421,6 +2428,7 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
@@ -2457,6 +2465,9 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
Result.vector4_f32[3] = floorf( V.vector4_f32[3] );
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndmq_f32(V);
#else
float32x4_t vTest = vabsq_f32( V );
vTest = vcltq_f32( vTest, g_XMNoFraction );
// Truncate
@@ -2469,6 +2480,7 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_floor_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
@@ -2506,6 +2518,9 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
Result.vector4_f32[3] = ceilf( V.vector4_f32[3] );
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndpq_f32(V);
#else
float32x4_t vTest = vabsq_f32( V );
vTest = vcltq_f32( vTest, g_XMNoFraction );
// Truncate
@@ -2518,6 +2533,7 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_ceil_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
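The four hunks above replace the 2^23 (8388608) magic-number rounding trick with ARMv8's directed-rounding instructions. A small hedged check of the mapping, assuming a GCC/Clang-style toolchain where NEON vector types accept brace initializers (MSVC's do not):

#include <arm_neon.h>
#include <cstdio>

int main()
{
    float32x4_t v = { 0.5f, 1.5f, -2.5f, 2.6f };
    float32x4_t n = vrndnq_f32(v);  // XMVectorRound:    0, 2, -2, 3 (nearest, ties to even)
    float32x4_t z = vrndq_f32(v);   // XMVectorTruncate: 0, 1, -2, 2 (toward zero)
    float32x4_t m = vrndmq_f32(v);  // XMVectorFloor:    0, 1, -3, 2 (toward -inf)
    float32x4_t p = vrndpq_f32(v);  // XMVectorCeiling:  1, 2, -2, 3 (toward +inf)
    printf("%g %g %g %g\n", vgetq_lane_f32(n, 0), vgetq_lane_f32(z, 1),
                            vgetq_lane_f32(m, 2), vgetq_lane_f32(p, 3));
    return 0;
}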
@@ -2798,11 +2814,16 @@ inline XMVECTOR XM_CALLCONV XMVectorSum
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
XMVECTOR vTemp = vpaddq_f32(V, V);
return vpaddq_f32(vTemp,vTemp);
#else
float32x2_t v1 = vget_low_f32(V);
float32x2_t v2 = vget_high_f32(V);
v1 = vadd_f32(v1, v2);
v1 = vpadd_f32(v1, v1);
return vcombine_f32(v1, v1);
#endif
#elif defined(_XM_SSE3_INTRINSICS_)
XMVECTOR vTemp = _mm_hadd_ps(V, V);
return _mm_hadd_ps(vTemp,vTemp);
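The new branch uses vpaddq_f32, the full-width pairwise add that AArch32 NEON lacks; applied twice it broadcasts the horizontal sum. A sketch of the lane arithmetic, with a hypothetical helper name:

#include <arm_neon.h>

inline float32x4_t HorizontalSum(float32x4_t v)
{
    // v = (a, b, c, d)  ->  t = (a+b, c+d, a+b, c+d)
    float32x4_t t = vpaddq_f32(v, v);
    // pairwise-adding t with itself -> (a+b+c+d) in every lane,
    // matching what XMVectorSum returns.
    return vpaddq_f32(t, t);
}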
@@ -3018,6 +3039,9 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vdivq_f32( V1, V2 );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(V2);
float32x4_t S = vrecpsq_f32( Reciprocal, V2 );
@@ -3025,6 +3049,7 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
S = vrecpsq_f32( Reciprocal, V2 );
Reciprocal = vmulq_f32( S, Reciprocal );
return vmulq_f32( V1, Reciprocal );
#endif
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_div_ps( V1, V2 );
#endif
@@ -3113,12 +3138,17 @@ inline XMVECTOR XM_CALLCONV XMVectorReciprocal
Result.vector4_f32[3] = 1.f / V.vector4_f32[3];
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
float32x4_t one = vdupq_n_f32(1.0f);
return vdivq_f32(one,V);
#else
// 2 iterations of Newton-Raphson refinement
float32x4_t Reciprocal = vrecpeq_f32(V);
float32x4_t S = vrecpsq_f32( Reciprocal, V );
Reciprocal = vmulq_f32( S, Reciprocal );
S = vrecpsq_f32( Reciprocal, V );
return vmulq_f32( S, Reciprocal );
#endif
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_div_ps(g_XMOne,V);
#endif
@@ -7973,6 +8003,10 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
#ifdef _M_ARM64
V.val[0] = vdivq_f32( vResult0, W );
V.val[1] = vdivq_f32( vResult1, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -7982,6 +8016,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
V.val[0] = vmulq_f32( vResult0, Reciprocal );
V.val[1] = vmulq_f32( vResult1, Reciprocal );
#endif
vst2q_f32( reinterpret_cast<float*>(pOutputVector),V );
pOutputVector += sizeof(XMFLOAT2)*4;
@@ -8002,6 +8037,10 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
V = vget_high_f32( vResult );
float32x2_t W = vdup_lane_f32( V, 1 );
#ifdef _M_ARM64
V = vget_low_f32( vResult );
V = vdiv_f32( V, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x2_t Reciprocal = vrecpe_f32( W );
float32x2_t S = vrecps_f32( Reciprocal, W );
@@ -8011,6 +8050,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
V = vget_low_f32( vResult );
V = vmul_f32( V, Reciprocal );
#endif
vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
pOutputVector += OutputStride;
@@ -10501,6 +10541,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
#ifdef _M_ARM64
V.val[0] = vdivq_f32( vResult0, W );
V.val[1] = vdivq_f32( vResult1, W );
V.val[2] = vdivq_f32( vResult2, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -10511,6 +10556,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
V.val[0] = vmulq_f32( vResult0, Reciprocal );
V.val[1] = vmulq_f32( vResult1, Reciprocal );
V.val[2] = vmulq_f32( vResult2, Reciprocal );
#endif
vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
pOutputVector += sizeof(XMFLOAT3)*4;
@@ -10534,6 +10580,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
VH = vget_high_f32(vResult);
XMVECTOR W = vdupq_lane_f32( VH, 1 );
#ifdef _M_ARM64
vResult = vdivq_f32( vResult, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x4_t Reciprocal = vrecpeq_f32( W );
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -10542,6 +10591,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
Reciprocal = vmulq_f32( S, Reciprocal );
vResult = vmulq_f32( vResult, Reciprocal );
#endif
VL = vget_low_f32( vResult );
vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
@@ -11451,6 +11501,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
#ifdef _M_ARM64
vResult0 = vdivq_f32( vResult0, W );
vResult1 = vdivq_f32( vResult1, W );
vResult2 = vdivq_f32( vResult2, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -11461,6 +11516,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
vResult0 = vmulq_f32( vResult0, Reciprocal );
vResult1 = vmulq_f32( vResult1, Reciprocal );
vResult2 = vmulq_f32( vResult2, Reciprocal );
#endif
V.val[0] = vmlaq_f32( OffsetX, vResult0, ScaleX );
V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY );
@@ -11493,6 +11549,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
VH = vget_high_f32(vResult);
XMVECTOR W = vdupq_lane_f32( VH, 1 );
#ifdef _M_ARM64
vResult = vdivq_f32( vResult, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x4_t Reciprocal = vrecpeq_f32( W );
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -11501,6 +11560,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
Reciprocal = vmulq_f32( S, Reciprocal );
vResult = vmulq_f32( vResult, Reciprocal );
#endif
vResult = vmlaq_f32( Offset, vResult, Scale );
@@ -12036,6 +12096,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
#ifdef _M_ARM64
V.val[0] = vdivq_f32( vResult0, W );
V.val[1] = vdivq_f32( vResult1, W );
V.val[2] = vdivq_f32( vResult2, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -12046,6 +12111,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
V.val[0] = vmulq_f32( vResult0, Reciprocal );
V.val[1] = vmulq_f32( vResult1, Reciprocal );
V.val[2] = vmulq_f32( vResult2, Reciprocal );
#endif
vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
pOutputVector += sizeof(XMFLOAT3)*4;
@@ -12080,6 +12146,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
VH = vget_high_f32(vResult);
XMVECTOR W = vdupq_lane_f32( VH, 1 );
#ifdef _M_ARM64
vResult = vdivq_f32( vResult, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x4_t Reciprocal = vrecpeq_f32( W );
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -12088,6 +12157,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
Reciprocal = vmulq_f32( S, Reciprocal );
vResult = vmulq_f32( vResult, Reciprocal );
#endif
VL = vget_low_f32( vResult );
vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );

Inc/DirectXMathPackedVector.inl

@@ -30,6 +30,10 @@ inline float PackedVector::XMConvertHalfToFloat
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
__m128 V2 = _mm_cvtph_ps( V1 );
return _mm_cvtss_f32( V2 );
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
uint16x4_t vHalf = vdup_n_u16(Value);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
return vgetq_lane_f32(vFloat, 0);
#else
uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
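The scalar AArch64 path above splats the 16-bit half into a vector, converts all four lanes, and extracts lane 0. A tiny hedged check (assuming an AArch64 toolchain with float16x4_t support), using the standard half-precision encoding 0x3C00 for 1.0:

#include <arm_neon.h>
#include <cassert>

int main()
{
    uint16x4_t h = vdup_n_u16(0x3C00);                      // 1.0 as an IEEE half
    float32x4_t f = vcvt_f32_f16(vreinterpret_f16_u16(h));  // widen all 4 lanes
    assert(vgetq_lane_f32(f, 0) == 1.0f);
    return 0;
}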
@@ -254,6 +258,117 @@ inline float* PackedVector::XMConvertHalfToFloatStream
XM_SFENCE();
return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = HalfCount >> 2;
if (four > 0)
{
if (InputStride == sizeof(HALF))
{
if (OutputStride == sizeof(float))
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
pHalf += InputStride * 4;
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
pFloat += OutputStride * 4;
i += 4;
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
pHalf += InputStride * 4;
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
pFloat += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(float))
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
uint16x4_t vHalf = vcreate_u16(iHalf);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
pFloat += OutputStride * 4;
i += 4;
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
uint16x4_t vHalf = vcreate_u16(iHalf);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
pFloat += OutputStride;
i += 4;
}
}
}
for (; i < HalfCount; ++i)
{
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
pHalf += InputStride;
pFloat += OutputStride;
}
return pOutputStream;
#else
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
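One detail in the scattered-input loops above is worth a note: four strided halves are packed into a single 64-bit integer and materialized with vcreate_u16, which maps the low 16 bits to lane 0 under NEON's little-endian lane order, so the first value loaded takes the smallest shift. A standalone sketch of the same gather (the helper name GatherHalf4 is assumed):

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

inline uint16x4_t GatherHalf4(const uint8_t* p, size_t strideInBytes)
{
    uint64_t bits = 0;
    for (unsigned lane = 0; lane < 4; ++lane)
    {
        bits |= uint64_t(*reinterpret_cast<const uint16_t*>(p)) << (16 * lane);
        p += strideInBytes;
    }
    return vcreate_u16(bits);  // lanes = (1st, 2nd, 3rd, 4th value loaded)
}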
@@ -281,6 +396,10 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
__m128 V1 = _mm_set_ss( Value );
__m128i V2 = _mm_cvtps_ph( V1, 0 );
return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
float32x4_t vFloat = vdupq_n_f32(Value);
float16x4_t vHalf = vcvt_f16_f32(vFloat);
return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
#else
uint32_t Result;
@@ -501,6 +620,119 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
pHalf += OutputStride;
}
return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = FloatCount >> 2;
if (four > 0)
{
if (InputStride == sizeof(float))
{
if (OutputStride == sizeof(HALF))
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride*4;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
pHalf += OutputStride*4;
i += 4;
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride*4;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
pHalf += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(HALF))
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vdupq_n_f32(0);
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
pFloat += InputStride;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
pHalf += OutputStride*4;
i += 4;
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vdupq_n_f32(0);
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
pFloat += InputStride;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
pHalf += OutputStride;
i += 4;
}
}
}
for (; i < FloatCount; ++i)
{
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
pFloat += InputStride;
pHalf += OutputStride;
}
return pOutputStream;
#else
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
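A hedged usage sketch of the two stream converters (array contents are arbitrary test values; assumes DirectXPackedVector.h is on the include path): both take the output pointer and stride first, and packed strides like these select the four-at-a-time fast paths added above.

#include <DirectXPackedVector.h>
using namespace DirectX::PackedVector;

int main()
{
    float src[8] = { 0.f, 0.5f, 1.f, 2.f, -1.f, 0.25f, 100.f, 3.f };
    HALF  half[8];
    float back[8];
    XMConvertFloatToHalfStream(half, sizeof(HALF), src, sizeof(float), 8);
    XMConvertHalfToFloatStream(back, sizeof(float), half, sizeof(HALF), 8);
    return 0;   // back[] now holds src[] rounded through half precision
}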