mirror of
https://github.com/microsoft/DirectXMath
synced 2024-11-09 14:10:09 +00:00
ARMv8 optimizations
This commit is contained in:
parent
dc68173efe
commit
d75f745b16
@ -2996,6 +2996,13 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
|
||||
r[3] = XMVectorDivide( r[3], vS );
|
||||
return *this;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
float32x4_t vS = vdupq_n_f32( S );
|
||||
r[0] = vdivq_f32( r[0], vS );
|
||||
r[1] = vdivq_f32( r[1], vS );
|
||||
r[2] = vdivq_f32( r[2], vS );
|
||||
r[3] = vdivq_f32( r[3], vS );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x2_t vS = vdup_n_f32( S );
|
||||
float32x2_t R0 = vrecpe_f32( vS );
|
||||
@ -3008,6 +3015,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
|
||||
r[1] = vmulq_f32( r[1], Reciprocal );
|
||||
r[2] = vmulq_f32( r[2], Reciprocal );
|
||||
r[3] = vmulq_f32( r[3], Reciprocal );
|
||||
#endif
|
||||
return *this;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128 vS = _mm_set_ps1( S );
|
||||
@ -3075,6 +3083,14 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
|
||||
R.r[3] = XMVectorDivide( r[3], vS );
|
||||
return R;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
float32x4_t vS = vdupq_n_f32( S );
|
||||
XMMATRIX R;
|
||||
R.r[0] = vdivq_f32( r[0], vS );
|
||||
R.r[1] = vdivq_f32( r[1], vS );
|
||||
R.r[2] = vdivq_f32( r[2], vS );
|
||||
R.r[3] = vdivq_f32( r[3], vS );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x2_t vS = vdup_n_f32( S );
|
||||
float32x2_t R0 = vrecpe_f32( vS );
|
||||
@ -3088,6 +3104,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
|
||||
R.r[1] = vmulq_f32( r[1], Reciprocal );
|
||||
R.r[2] = vmulq_f32( r[2], Reciprocal );
|
||||
R.r[3] = vmulq_f32( r[3], Reciprocal );
|
||||
#endif
|
||||
return R;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128 vS = _mm_set_ps1( S );
|
||||
|
@ -2352,6 +2352,9 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
|
||||
return Result;
|
||||
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
return vrndnq_f32(V);
|
||||
#else
|
||||
uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
|
||||
uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign );
|
||||
float32x4_t R1 = vaddq_f32( V, sMagic );
|
||||
@ -2360,6 +2363,7 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
|
||||
uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
|
||||
XMVECTOR vResult = vbslq_f32( mask, R1, V );
|
||||
return vResult;
|
||||
#endif
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -2412,6 +2416,9 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
|
||||
return Result;
|
||||
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
return vrndq_f32(V);
|
||||
#else
|
||||
float32x4_t vTest = vabsq_f32( V );
|
||||
vTest = vcltq_f32( vTest, g_XMNoFraction );
|
||||
|
||||
@ -2421,6 +2428,7 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
|
||||
// All numbers less than 8388608 will use the round to int
|
||||
// All others, use the ORIGINAL value
|
||||
return vbslq_f32( vTest, vResult, V );
|
||||
#endif
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -2457,6 +2465,9 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
|
||||
Result.vector4_f32[3] = floorf( V.vector4_f32[3] );
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
return vrndmq_f32(V);
|
||||
#else
|
||||
float32x4_t vTest = vabsq_f32( V );
|
||||
vTest = vcltq_f32( vTest, g_XMNoFraction );
|
||||
// Truncate
|
||||
@ -2469,6 +2480,7 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
|
||||
// All numbers less than 8388608 will use the round to int
|
||||
// All others, use the ORIGINAL value
|
||||
return vbslq_f32( vTest, vResult, V );
|
||||
#endif
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_floor_ps( V );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -2506,6 +2518,9 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
|
||||
Result.vector4_f32[3] = ceilf( V.vector4_f32[3] );
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
return vrndpq_f32(V);
|
||||
#else
|
||||
float32x4_t vTest = vabsq_f32( V );
|
||||
vTest = vcltq_f32( vTest, g_XMNoFraction );
|
||||
// Truncate
|
||||
@ -2518,6 +2533,7 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
|
||||
// All numbers less than 8388608 will use the round to int
|
||||
// All others, use the ORIGINAL value
|
||||
return vbslq_f32( vTest, vResult, V );
|
||||
#endif
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
return _mm_ceil_ps( V );
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -2798,11 +2814,16 @@ inline XMVECTOR XM_CALLCONV XMVectorSum
|
||||
return Result;
|
||||
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
XMVECTOR vTemp = vpaddq_f32(V, V);
|
||||
return vpaddq_f32(vTemp,vTemp);
|
||||
#else
|
||||
float32x2_t v1 = vget_low_f32(V);
|
||||
float32x2_t v2 = vget_high_f32(V);
|
||||
v1 = vadd_f32(v1, v2);
|
||||
v1 = vpadd_f32(v1, v1);
|
||||
return vcombine_f32(v1, v1);
|
||||
#endif
|
||||
#elif defined(_XM_SSE3_INTRINSICS_)
|
||||
XMVECTOR vTemp = _mm_hadd_ps(V, V);
|
||||
return _mm_hadd_ps(vTemp,vTemp);
|
||||
@ -3018,6 +3039,9 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
|
||||
Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
return vdivq_f32( V1, V2 );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x4_t Reciprocal = vrecpeq_f32(V2);
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, V2 );
|
||||
@ -3025,6 +3049,7 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
|
||||
S = vrecpsq_f32( Reciprocal, V2 );
|
||||
Reciprocal = vmulq_f32( S, Reciprocal );
|
||||
return vmulq_f32( V1, Reciprocal );
|
||||
#endif
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return _mm_div_ps( V1, V2 );
|
||||
#endif
|
||||
@ -3113,12 +3138,17 @@ inline XMVECTOR XM_CALLCONV XMVectorReciprocal
|
||||
Result.vector4_f32[3] = 1.f / V.vector4_f32[3];
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
#ifdef _M_ARM64
|
||||
float32x4_t one = vdupq_n_f32(1.0f);
|
||||
return vdivq_f32(one,V);
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement
|
||||
float32x4_t Reciprocal = vrecpeq_f32(V);
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, V );
|
||||
Reciprocal = vmulq_f32( S, Reciprocal );
|
||||
S = vrecpsq_f32( Reciprocal, V );
|
||||
return vmulq_f32( S, Reciprocal );
|
||||
#endif
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return _mm_div_ps(g_XMOne,V);
|
||||
#endif
|
||||
@ -7973,6 +8003,10 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
|
||||
|
||||
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
V.val[0] = vdivq_f32( vResult0, W );
|
||||
V.val[1] = vdivq_f32( vResult1, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x4_t Reciprocal = vrecpeq_f32(W);
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -7982,6 +8016,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
|
||||
|
||||
V.val[0] = vmulq_f32( vResult0, Reciprocal );
|
||||
V.val[1] = vmulq_f32( vResult1, Reciprocal );
|
||||
#endif
|
||||
|
||||
vst2q_f32( reinterpret_cast<float*>(pOutputVector),V );
|
||||
pOutputVector += sizeof(XMFLOAT2)*4;
|
||||
@ -8002,6 +8037,10 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
|
||||
V = vget_high_f32( vResult );
|
||||
float32x2_t W = vdup_lane_f32( V, 1 );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
V = vget_low_f32( vResult );
|
||||
V = vdiv_f32( V, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal for W
|
||||
float32x2_t Reciprocal = vrecpe_f32( W );
|
||||
float32x2_t S = vrecps_f32( Reciprocal, W );
|
||||
@ -8011,6 +8050,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
|
||||
|
||||
V = vget_low_f32( vResult );
|
||||
V = vmul_f32( V, Reciprocal );
|
||||
#endif
|
||||
|
||||
vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
|
||||
pOutputVector += OutputStride;
|
||||
@ -10501,6 +10541,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
|
||||
|
||||
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
V.val[0] = vdivq_f32( vResult0, W );
|
||||
V.val[1] = vdivq_f32( vResult1, W );
|
||||
V.val[2] = vdivq_f32( vResult2, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x4_t Reciprocal = vrecpeq_f32(W);
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -10511,6 +10556,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
|
||||
V.val[0] = vmulq_f32( vResult0, Reciprocal );
|
||||
V.val[1] = vmulq_f32( vResult1, Reciprocal );
|
||||
V.val[2] = vmulq_f32( vResult2, Reciprocal );
|
||||
#endif
|
||||
|
||||
vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
|
||||
pOutputVector += sizeof(XMFLOAT3)*4;
|
||||
@ -10534,6 +10580,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
|
||||
VH = vget_high_f32(vResult);
|
||||
XMVECTOR W = vdupq_lane_f32( VH, 1 );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
vResult = vdivq_f32( vResult, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal for W
|
||||
float32x4_t Reciprocal = vrecpeq_f32( W );
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -10542,6 +10591,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
|
||||
Reciprocal = vmulq_f32( S, Reciprocal );
|
||||
|
||||
vResult = vmulq_f32( vResult, Reciprocal );
|
||||
#endif
|
||||
|
||||
VL = vget_low_f32( vResult );
|
||||
vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
|
||||
@ -11451,6 +11501,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
|
||||
|
||||
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
vResult0 = vdivq_f32( vResult0, W );
|
||||
vResult1 = vdivq_f32( vResult1, W );
|
||||
vResult2 = vdivq_f32( vResult2, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x4_t Reciprocal = vrecpeq_f32(W);
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -11461,6 +11516,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
|
||||
vResult0 = vmulq_f32( vResult0, Reciprocal );
|
||||
vResult1 = vmulq_f32( vResult1, Reciprocal );
|
||||
vResult2 = vmulq_f32( vResult2, Reciprocal );
|
||||
#endif
|
||||
|
||||
V.val[0] = vmlaq_f32( OffsetX, vResult0, ScaleX );
|
||||
V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY );
|
||||
@ -11493,6 +11549,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
|
||||
VH = vget_high_f32(vResult);
|
||||
XMVECTOR W = vdupq_lane_f32( VH, 1 );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
vResult = vdivq_f32( vResult, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal for W
|
||||
float32x4_t Reciprocal = vrecpeq_f32( W );
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -11501,6 +11560,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
|
||||
Reciprocal = vmulq_f32( S, Reciprocal );
|
||||
|
||||
vResult = vmulq_f32( vResult, Reciprocal );
|
||||
#endif
|
||||
|
||||
vResult = vmlaq_f32( Offset, vResult, Scale );
|
||||
|
||||
@ -12036,6 +12096,11 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
|
||||
|
||||
__prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
V.val[0] = vdivq_f32( vResult0, W );
|
||||
V.val[1] = vdivq_f32( vResult1, W );
|
||||
V.val[2] = vdivq_f32( vResult2, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal
|
||||
float32x4_t Reciprocal = vrecpeq_f32(W);
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -12046,6 +12111,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
|
||||
V.val[0] = vmulq_f32( vResult0, Reciprocal );
|
||||
V.val[1] = vmulq_f32( vResult1, Reciprocal );
|
||||
V.val[2] = vmulq_f32( vResult2, Reciprocal );
|
||||
#endif
|
||||
|
||||
vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
|
||||
pOutputVector += sizeof(XMFLOAT3)*4;
|
||||
@ -12080,6 +12146,9 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
|
||||
VH = vget_high_f32(vResult);
|
||||
XMVECTOR W = vdupq_lane_f32( VH, 1 );
|
||||
|
||||
#ifdef _M_ARM64
|
||||
vResult = vdivq_f32( vResult, W );
|
||||
#else
|
||||
// 2 iterations of Newton-Raphson refinement of reciprocal for W
|
||||
float32x4_t Reciprocal = vrecpeq_f32( W );
|
||||
float32x4_t S = vrecpsq_f32( Reciprocal, W );
|
||||
@ -12088,6 +12157,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
|
||||
Reciprocal = vmulq_f32( S, Reciprocal );
|
||||
|
||||
vResult = vmulq_f32( vResult, Reciprocal );
|
||||
#endif
|
||||
|
||||
VL = vget_low_f32( vResult );
|
||||
vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
|
||||
|
@ -30,6 +30,10 @@ inline float PackedVector::XMConvertHalfToFloat
|
||||
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
|
||||
__m128 V2 = _mm_cvtph_ps( V1 );
|
||||
return _mm_cvtss_f32( V2 );
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
|
||||
uint16x4_t vHalf = vdup_n_u16(Value);
|
||||
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
|
||||
return vgetq_lane_f32(vFloat, 0);
|
||||
#else
|
||||
uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
|
||||
|
||||
@ -254,6 +258,117 @@ inline float* PackedVector::XMConvertHalfToFloatStream
|
||||
|
||||
XM_SFENCE();
|
||||
|
||||
return pOutputStream;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
|
||||
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = HalfCount >> 2;
|
||||
if (four > 0)
|
||||
{
|
||||
if (InputStride == sizeof(HALF))
|
||||
{
|
||||
if (OutputStride == sizeof(float))
|
||||
{
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
|
||||
|
||||
vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
|
||||
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
|
||||
pFloat += OutputStride;
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
|
||||
pFloat += OutputStride;
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
|
||||
pFloat += OutputStride;
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (OutputStride == sizeof(float))
|
||||
{
|
||||
// Scattered input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
|
||||
uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
|
||||
uint16x4_t vHalf = vcreate_u16(iHalf);
|
||||
|
||||
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
|
||||
|
||||
vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
|
||||
uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
|
||||
uint16x4_t vHalf = vcreate_u16(iHalf);
|
||||
|
||||
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
|
||||
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
|
||||
pFloat += OutputStride;
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
|
||||
pFloat += OutputStride;
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
|
||||
pFloat += OutputStride;
|
||||
vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; i < HalfCount; ++i)
|
||||
{
|
||||
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
|
||||
pHalf += InputStride;
|
||||
pFloat += OutputStride;
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
#else
|
||||
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
@ -281,6 +396,10 @@ inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
|
||||
__m128 V1 = _mm_set_ss( Value );
|
||||
__m128i V2 = _mm_cvtps_ph( V1, 0 );
|
||||
return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
|
||||
float32x4_t vFloat = vdupq_n_f32(Value);
|
||||
float16x4_t vHalf = vcvt_f16_f32(vFloat);
|
||||
return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
|
||||
#else
|
||||
uint32_t Result;
|
||||
|
||||
@ -501,6 +620,119 @@ inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
|
||||
pHalf += OutputStride;
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(_M_ARM64) && !defined(_XM_NO_INTRINSICS_)
|
||||
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = FloatCount >> 2;
|
||||
if (four > 0)
|
||||
{
|
||||
if (InputStride == sizeof(float))
|
||||
{
|
||||
if (OutputStride == sizeof(HALF))
|
||||
{
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride*4;
|
||||
|
||||
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
|
||||
|
||||
vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
|
||||
pHalf += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride*4;
|
||||
|
||||
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
|
||||
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 0);
|
||||
pHalf += OutputStride;
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 1);
|
||||
pHalf += OutputStride;
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 2);
|
||||
pHalf += OutputStride;
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 3);
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (OutputStride == sizeof(HALF))
|
||||
{
|
||||
// Scattered input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
float32x4_t vFloat = vdupq_n_f32(0);
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
|
||||
pFloat += InputStride;
|
||||
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
|
||||
pFloat += InputStride;
|
||||
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
|
||||
pFloat += InputStride;
|
||||
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
|
||||
pFloat += InputStride;
|
||||
|
||||
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
|
||||
|
||||
vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
|
||||
pHalf += OutputStride*4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
float32x4_t vFloat = vdupq_n_f32(0);
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
|
||||
pFloat += InputStride;
|
||||
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
|
||||
pFloat += InputStride;
|
||||
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
|
||||
pFloat += InputStride;
|
||||
|
||||
vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
|
||||
pFloat += InputStride;
|
||||
|
||||
uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
|
||||
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 0);
|
||||
pHalf += OutputStride;
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 1);
|
||||
pHalf += OutputStride;
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 2);
|
||||
pHalf += OutputStride;
|
||||
vst1_lane_u16(reinterpret_cast<float*>(pHalf), vHalf, 3);
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; i < FloatCount; ++i)
|
||||
{
|
||||
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
|
||||
pFloat += InputStride;
|
||||
pHalf += OutputStride;
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
#else
|
||||
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
|
Loading…
Reference in New Issue
Block a user