Mirror of https://github.com/microsoft/DirectXMath

ARMv8 optimizations

Chuck Walbourn 2016-11-09 15:51:02 -08:00
parent dc68173efe
commit d75f745b16
3 changed files with 319 additions and 0 deletions
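Every hunk in this commit follows one pattern: AArch64 NEON gains a true vector divide (and directed rounding), so the two-step vrecpe/vrecps Newton-Raphson approximation needed on AArch32 becomes the fallback path. A minimal standalone sketch of that dispatch, not part of the diff (the helper name DividePS and the dual _M_ARM64/__aarch64__ guard are assumptions; the library itself tests only _M_ARM64):

#include <arm_neon.h>

inline float32x4_t DividePS(float32x4_t num, float32x4_t den)
{
#if defined(_M_ARM64) || defined(__aarch64__)
    // ARMv8 AArch64 has a true vector divide: exact, IEEE-rounded.
    return vdivq_f32(num, den);
#else
    // AArch32 fallback: 2 iterations of Newton-Raphson refinement of 1/den.
    float32x4_t r = vrecpeq_f32(den);       // ~8-bit estimate of 1/den
    r = vmulq_f32(vrecpsq_f32(r, den), r);  // vrecps computes 2 - den*r
    r = vmulq_f32(vrecpsq_f32(r, den), r);  // second refinement
    return vmulq_f32(num, r);
#endif
}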

Inc/DirectXMathMatrix.inl

@@ -2996,6 +2996,13 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
r[3] = XMVectorDivide( r[3], vS );
return *this;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
float32x4_t vS = vdupq_n_f32( S );
r[0] = vdivq_f32( r[0], vS );
r[1] = vdivq_f32( r[1], vS );
r[2] = vdivq_f32( r[2], vS );
r[3] = vdivq_f32( r[3], vS );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x2_t vS = vdup_n_f32( S );
float32x2_t R0 = vrecpe_f32( vS );
@@ -3008,6 +3015,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
r[1] = vmulq_f32( r[1], Reciprocal );
r[2] = vmulq_f32( r[2], Reciprocal );
r[3] = vmulq_f32( r[3], Reciprocal );
#endif
return *this;
#elif defined(_XM_SSE_INTRINSICS_)
__m128 vS = _mm_set_ps1( S );
@@ -3075,6 +3083,14 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
R.r[3] = XMVectorDivide( r[3], vS );
return R;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
float32x4_t vS = vdupq_n_f32( S );
XMMATRIX R;
R.r[0] = vdivq_f32( r[0], vS );
R.r[1] = vdivq_f32( r[1], vS );
R.r[2] = vdivq_f32( r[2], vS );
R.r[3] = vdivq_f32( r[3], vS );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x2_t vS = vdup_n_f32( S );
float32x2_t R0 = vrecpe_f32( vS );
@@ -3088,6 +3104,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
R.r[1] = vmulq_f32( r[1], Reciprocal );
R.r[2] = vmulq_f32( r[2], Reciprocal );
R.r[3] = vmulq_f32( r[3], Reciprocal );
#endif
return R;
#elif defined(_XM_SSE_INTRINSICS_)
__m128 vS = _mm_set_ps1( S );
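A note on the fallback that recurs above: vrecpsq_f32(x, d) computes 2 - d*x, so each refinement step is

    x_{n+1} = x_n (2 - d x_n)

Writing x_n = (1 - e_n)/d gives x_{n+1} = (1 - e_n^2)/d, i.e. the relative error squares each step. The vrecpe estimate is good to roughly 8 bits (e_0 ~ 2^-8), so two iterations drive the error to about 2^-32, below a float's 24-bit significand. That is why exactly two iterations appear throughout, and why the new AArch64 vdivq_f32 path is both shorter and exactly rounded.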

Inc/DirectXMathVector.inl

@@ -2352,6 +2352,9 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndnq_f32(V);
#else
uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign );
float32x4_t R1 = vaddq_f32( V, sMagic );
@@ -2360,6 +2363,7 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
XMVECTOR vResult = vbslq_f32( mask, R1, V );
return vResult;
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
@@ -2412,6 +2416,9 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndq_f32(V);
#else
float32x4_t vTest = vabsq_f32( V );
vTest = vcltq_f32( vTest, g_XMNoFraction );
@@ -2421,6 +2428,7 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
#elif defined(_XM_SSE_INTRINSICS_)
@@ -2457,6 +2465,9 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
Result.vector4_f32[3] = floorf( V.vector4_f32[3] );
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndmq_f32(V);
#else
float32x4_t vTest = vabsq_f32( V );
vTest = vcltq_f32( vTest, g_XMNoFraction );
// Truncate
@@ -2469,6 +2480,7 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_floor_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
@@ -2506,6 +2518,9 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
Result.vector4_f32[3] = ceilf( V.vector4_f32[3] );
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vrndpq_f32(V);
#else
float32x4_t vTest = vabsq_f32( V );
vTest = vcltq_f32( vTest, g_XMNoFraction );
// Truncate
@@ -2518,6 +2533,7 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
// All numbers less than 8388608 will use the round to int
// All others, use the ORIGINAL value
return vbslq_f32( vTest, vResult, V );
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_ceil_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
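The four hunks above replace the 2^23 (8388608) magic-number rounding trick with ARMv8's directed-rounding instructions. A small hedged check of the mapping, assuming a GCC/Clang-style toolchain where NEON vector types accept brace initializers (MSVC's do not):

#include <arm_neon.h>
#include <cstdio>

int main()
{
    float32x4_t v = { 0.5f, 1.5f, -2.5f, 2.6f };
    float32x4_t n = vrndnq_f32(v);  // XMVectorRound:    0, 2, -2, 3 (nearest, ties to even)
    float32x4_t z = vrndq_f32(v);   // XMVectorTruncate: 0, 1, -2, 2 (toward zero)
    float32x4_t m = vrndmq_f32(v);  // XMVectorFloor:    0, 1, -3, 2 (toward -inf)
    float32x4_t p = vrndpq_f32(v);  // XMVectorCeiling:  1, 2, -2, 3 (toward +inf)
    printf("%g %g %g %g\n", vgetq_lane_f32(n, 0), vgetq_lane_f32(z, 1),
                            vgetq_lane_f32(m, 2), vgetq_lane_f32(p, 3));
    return 0;
}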
@@ -2798,11 +2814,16 @@ inline XMVECTOR XM_CALLCONV XMVectorSum
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
XMVECTOR vTemp = vpaddq_f32(V, V);
return vpaddq_f32(vTemp,vTemp);
#else
float32x2_t v1 = vget_low_f32(V);
float32x2_t v2 = vget_high_f32(V);
v1 = vadd_f32(v1, v2);
v1 = vpadd_f32(v1, v1);
return vcombine_f32(v1, v1);
#endif
#elif defined(_XM_SSE3_INTRINSICS_)
XMVECTOR vTemp = _mm_hadd_ps(V, V);
return _mm_hadd_ps(vTemp,vTemp);
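The new branch uses vpaddq_f32, the full-width pairwise add that AArch32 NEON lacks; applied twice it broadcasts the horizontal sum. A sketch of the lane arithmetic, with a hypothetical helper name:

#include <arm_neon.h>

inline float32x4_t HorizontalSum(float32x4_t v)
{
    // v = (a, b, c, d)  ->  t = (a+b, c+d, a+b, c+d)
    float32x4_t t = vpaddq_f32(v, v);
    // pairwise-adding t with itself -> (a+b+c+d) in every lane,
    // matching what XMVectorSum returns.
    return vpaddq_f32(t, t);
}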
@@ -3018,6 +3039,9 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
return vdivq_f32( V1, V2 );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(V2);
float32x4_t S = vrecpsq_f32( Reciprocal, V2 );
@@ -3025,6 +3049,7 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
S = vrecpsq_f32( Reciprocal, V2 );
Reciprocal = vmulq_f32( S, Reciprocal );
return vmulq_f32( V1, Reciprocal );
#endif
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_div_ps( V1, V2 );
#endif
@@ -3113,12 +3138,17 @@ inline XMVECTOR XM_CALLCONV XMVectorReciprocal
Result.vector4_f32[3] = 1.f / V.vector4_f32[3];
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#ifdef _M_ARM64
float32x4_t one = vdupq_n_f32(1.0f);
return vdivq_f32(one,V);
#else
// 2 iterations of Newton-Raphson refinement
float32x4_t Reciprocal = vrecpeq_f32(V);
float32x4_t S = vrecpsq_f32( Reciprocal, V );
Reciprocal = vmulq_f32( S, Reciprocal );
S = vrecpsq_f32( Reciprocal, V );
return vmulq_f32( S, Reciprocal );
#endif
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_div_ps(g_XMOne,V);
#endif
@@ -7973,6 +8003,10 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
#ifdef _M_ARM64
V.val[0] = vdivq_f32( vResult0, W );
V.val[1] = vdivq_f32( vResult1, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -7982,6 +8016,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
V.val[0] = vmulq_f32( vResult0, Reciprocal );
V.val[1] = vmulq_f32( vResult1, Reciprocal );
#endif
vst2q_f32( reinterpret_cast<float*>(pOutputVector),V );
pOutputVector += sizeof(XMFLOAT2)*4;
@@ -8002,6 +8037,10 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
V = vget_high_f32( vResult );
float32x2_t W = vdup_lane_f32( V, 1 );
#ifdef _M_ARM64
V = vget_low_f32( vResult );
V = vdiv_f32( V, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x2_t Reciprocal = vrecpe_f32( W );
float32x2_t S = vrecps_f32( Reciprocal, W );
@@ -8011,6 +8050,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
V = vget_low_f32( vResult );
V = vmul_f32( V, Reciprocal );
#endif
vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
pOutputVector += OutputStride;
@@ -10501,6 +10541,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
#ifdef _M_ARM64
V.val[0] = vdivq_f32( vResult0, W );
V.val[1] = vdivq_f32( vResult1, W );
V.val[2] = vdivq_f32( vResult2, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -10511,6 +10556,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
V.val[0] = vmulq_f32( vResult0, Reciprocal );
V.val[1] = vmulq_f32( vResult1, Reciprocal );
V.val[2] = vmulq_f32( vResult2, Reciprocal );
#endif
vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
pOutputVector += sizeof(XMFLOAT3)*4;
@@ -10534,6 +10580,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
VH = vget_high_f32(vResult);
XMVECTOR W = vdupq_lane_f32( VH, 1 );
#ifdef _M_ARM64
vResult = vdivq_f32( vResult, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x4_t Reciprocal = vrecpeq_f32( W );
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -10542,6 +10591,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
Reciprocal = vmulq_f32( S, Reciprocal );
vResult = vmulq_f32( vResult, Reciprocal );
#endif
VL = vget_low_f32( vResult );
vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
@@ -11451,6 +11501,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
#ifdef _M_ARM64
vResult0 = vdivq_f32( vResult0, W );
vResult1 = vdivq_f32( vResult1, W );
vResult2 = vdivq_f32( vResult2, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -11461,6 +11516,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
vResult0 = vmulq_f32( vResult0, Reciprocal );
vResult1 = vmulq_f32( vResult1, Reciprocal );
vResult2 = vmulq_f32( vResult2, Reciprocal );
#endif
V.val[0] = vmlaq_f32( OffsetX, vResult0, ScaleX );
V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY );
@@ -11493,6 +11549,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
VH = vget_high_f32(vResult);
XMVECTOR W = vdupq_lane_f32( VH, 1 );
#ifdef _M_ARM64
vResult = vdivq_f32( vResult, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x4_t Reciprocal = vrecpeq_f32( W );
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -11501,6 +11560,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
Reciprocal = vmulq_f32( S, Reciprocal );
vResult = vmulq_f32( vResult, Reciprocal );
#endif
vResult = vmlaq_f32( Offset, vResult, Scale );
@@ -12036,6 +12096,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
#ifdef _M_ARM64
V.val[0] = vdivq_f32( vResult0, W );
V.val[1] = vdivq_f32( vResult1, W );
V.val[2] = vdivq_f32( vResult2, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal
float32x4_t Reciprocal = vrecpeq_f32(W);
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -12046,6 +12111,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
V.val[0] = vmulq_f32( vResult0, Reciprocal );
V.val[1] = vmulq_f32( vResult1, Reciprocal );
V.val[2] = vmulq_f32( vResult2, Reciprocal );
#endif
vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
pOutputVector += sizeof(XMFLOAT3)*4;
@@ -12080,6 +12146,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
VH = vget_high_f32(vResult);
XMVECTOR W = vdupq_lane_f32( VH, 1 );
#ifdef _M_ARM64
vResult = vdivq_f32( vResult, W );
#else
// 2 iterations of Newton-Raphson refinement of reciprocal for W
float32x4_t Reciprocal = vrecpeq_f32( W );
float32x4_t S = vrecpsq_f32( Reciprocal, W );
@@ -12088,6 +12157,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
Reciprocal = vmulq_f32( S, Reciprocal );
vResult = vmulq_f32( vResult, Reciprocal );
#endif
VL = vget_low_f32( vResult );
vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );

Inc/DirectXMathPackedVector.inl

@@ -30,6 +30,10 @@ inline float PackedVector::XMConvertHalfToFloat
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
__m128 V2 = _mm_cvtph_ps( V1 );
return _mm_cvtss_f32( V2 );
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
uint16x4_t vHalf = vdup_n_u16(Value);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
return vgetq_lane_f32(vFloat, 0);
#else
uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
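The scalar AArch64 path above splats the 16-bit half into a vector, converts all four lanes, and extracts lane 0. A tiny hedged check (assuming an AArch64 toolchain with float16x4_t support), using the standard half-precision encoding 0x3C00 for 1.0:

#include <arm_neon.h>
#include <cassert>

int main()
{
    uint16x4_t h = vdup_n_u16(0x3C00);                      // 1.0 as an IEEE half
    float32x4_t f = vcvt_f32_f16(vreinterpret_f16_u16(h));  // widen all 4 lanes
    assert(vgetq_lane_f32(f, 0) == 1.0f);
    return 0;
}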
@@ -254,6 +258,117 @@ inline float* PackedVector::XMConvertHalfToFloatStream
XM_SFENCE();
return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = HalfCount >> 2;
if (four > 0)
{
if (InputStride == sizeof(HALF))
{
if (OutputStride == sizeof(float))
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
pHalf += InputStride * 4;
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
pFloat += OutputStride * 4;
i += 4;
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
pHalf += InputStride * 4;
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
pFloat += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(float))
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
uint16x4_t vHalf = vcreate_u16(iHalf);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
pFloat += OutputStride * 4;
i += 4;
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
uint16x4_t vHalf = vcreate_u16(iHalf);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
pFloat += OutputStride;
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
pFloat += OutputStride;
i += 4;
}
}
}
for (; i < HalfCount; ++i)
{
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
pHalf += InputStride;
pFloat += OutputStride;
}
return pOutputStream;
#else
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
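One detail in the scattered-input loops above is worth a note: four strided halves are packed into a single 64-bit integer and materialized with vcreate_u16, which maps the low 16 bits to lane 0 under NEON's little-endian lane order, so the first value loaded takes the smallest shift. A standalone sketch of the same gather (the helper name GatherHalf4 is assumed):

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

inline uint16x4_t GatherHalf4(const uint8_t* p, size_t strideInBytes)
{
    uint64_t bits = 0;
    for (unsigned lane = 0; lane < 4; ++lane)
    {
        bits |= uint64_t(*reinterpret_cast<const uint16_t*>(p)) << (16 * lane);
        p += strideInBytes;
    }
    return vcreate_u16(bits);  // lanes = (1st, 2nd, 3rd, 4th value loaded)
}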
@@ -281,6 +396,10 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
__m128 V1 = _mm_set_ss( Value );
__m128i V2 = _mm_cvtps_ph( V1, 0 );
return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
float32x4_t vFloat = vdupq_n_f32(Value);
float16x4_t vHalf = vcvt_f16_f32(vFloat);
return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
#else
uint32_t Result;
@@ -501,6 +620,119 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
pHalf += OutputStride;
}
return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = FloatCount >> 2;
if (four > 0)
{
if (InputStride == sizeof(float))
{
if (OutputStride == sizeof(HALF))
{
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride*4;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
pHalf += OutputStride*4;
i += 4;
}
}
else
{
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride*4;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
pHalf += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(HALF))
{
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vdupq_n_f32(0);
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
pFloat += InputStride;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
pHalf += OutputStride*4;
i += 4;
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
float32x4_t vFloat = vdupq_n_f32(0);
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
pFloat += InputStride;
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
pFloat += InputStride;
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
pHalf += OutputStride;
vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
pHalf += OutputStride;
i += 4;
}
}
}
for (; i < FloatCount; ++i)
{
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
pFloat += InputStride;
pHalf += OutputStride;
}
return pOutputStream;
#else
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
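A hedged usage sketch of the two stream converters (array contents are arbitrary test values; assumes DirectXPackedVector.h is on the include path): both take the output pointer and stride first, and packed strides like these select the four-at-a-time fast paths added above.

#include <DirectXPackedVector.h>
using namespace DirectX::PackedVector;

int main()
{
    float src[8] = { 0.f, 0.5f, 1.f, 2.f, -1.f, 0.25f, 100.f, 3.f };
    HALF  half[8];
    float back[8];
    XMConvertFloatToHalfStream(half, sizeof(HALF), src, sizeof(float), 8);
    XMConvertHalfToFloatStream(back, sizeof(float), half, sizeof(HALF), 8);
    return 0;   // back[] now holds src[] rounded through half precision
}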