1
0
mirror of https://github.com/microsoft/DirectXMath synced 2024-11-09 22:20:08 +00:00

ARM-NEON intrinsics code paths now type-safe (#115)

This commit is contained in:
Chuck Walbourn 2020-07-31 15:49:13 -07:00 committed by GitHub
parent 404c59a7dd
commit 9962628ef4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 561 additions and 539 deletions

View File

@ -196,7 +196,7 @@
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(__clang__)
#if defined(__clang__) || defined(__GNUC__)
#define XM_PREFETCH( a ) __builtin_prefetch(a)
#elif defined(_MSC_VER)
#define XM_PREFETCH( a ) __prefetch(a)
@ -380,9 +380,13 @@ namespace DirectX
inline operator XMVECTOR() const noexcept { return v; }
inline operator const float* () const noexcept { return f; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
};
@ -395,9 +399,13 @@ namespace DirectX
};
inline operator XMVECTOR() const noexcept { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
};
@ -410,9 +418,13 @@ namespace DirectX
};
inline operator XMVECTOR() const noexcept { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
};
@ -425,9 +437,13 @@ namespace DirectX
};
inline operator XMVECTOR() const noexcept { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
};
@ -2166,7 +2182,7 @@ namespace DirectX
// Convert DivExponent into 1.0f/(1<<DivExponent)
uint32_t uScale = 0x3F800000U - (DivExponent << 23);
// Splat the scalar value (It's really a float)
vScale = vdupq_n_u32(uScale);
vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
// Multiply by the reciprocal (Perform a right shift by DivExponent)
vResult = vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
return vResult;

View File

@ -39,7 +39,7 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float fScale = 1.0f / (float)(1U << DivExponent);
float32x4_t vResult = vcvtq_f32_s32(VInt);
float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
return vmulq_n_f32(vResult, fScale);
#else // _XM_SSE_INTRINSICS_
// Convert to floats
@ -91,10 +91,10 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
// Float to int conversion
int32x4_t vResulti = vcvtq_s32_f32(vResult);
// If there was positive overflow, set to 0x7FFFFFFF
vResult = vandq_u32(vOverflow, g_XMAbsMask);
vOverflow = vbicq_u32(vResulti, vOverflow);
vOverflow = vorrq_u32(vOverflow, vResult);
return vOverflow;
vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
return vreinterpretq_f32_u32(vOverflow);
#else // _XM_SSE_INTRINSICS_
XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
vResult = _mm_mul_ps(vResult, VFloat);
@ -129,7 +129,7 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float fScale = 1.0f / (float)(1U << DivExponent);
float32x4_t vResult = vcvtq_f32_u32(VUInt);
float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
return vmulq_n_f32(vResult, fScale);
#else // _XM_SSE_INTRINSICS_
// For the values that are higher than 0x7FFFFFFF, a fixup is needed
@ -191,9 +191,9 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
// Float to int conversion
uint32x4_t vResulti = vcvtq_u32_f32(vResult);
// If there was overflow, set to 0xFFFFFFFFU
vResult = vbicq_u32(vResulti, vOverflow);
vOverflow = vorrq_u32(vOverflow, vResult);
return vOverflow;
vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
return vreinterpretq_f32_u32(vOverflow);
#else // _XM_SSE_INTRINSICS_
XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
vResult = _mm_mul_ps(vResult, VFloat);
@ -240,7 +240,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept
return V;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint32x4_t zero = vdupq_n_u32(0);
return vld1q_lane_u32(pSource, zero, 0);
return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0));
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_load_ss(reinterpret_cast<const float*>(pSource));
#endif
@ -281,7 +281,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint32x2_t x = vld1_u32(pSource);
uint32x2_t zero = vdup_n_u32(0);
return vcombine_u32(x, zero);
return vreinterpretq_f32_u32(vcombine_u32(x, zero));
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
#endif
@ -307,7 +307,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept
uint32x2_t x = vld1_u32(pSource);
#endif
uint32x2_t zero = vdup_n_u32(0);
return vcombine_u32(x, zero);
return vreinterpretq_f32_u32(vcombine_u32(x, zero));
#elif defined(_XM_SSE_INTRINSICS_)
return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
#endif
@ -434,7 +434,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept
uint32x2_t x = vld1_u32(pSource);
uint32x2_t zero = vdup_n_u32(0);
uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0);
return vcombine_u32(x, y);
return vreinterpretq_f32_u32(vcombine_u32(x, y));
#elif defined(_XM_SSE4_INTRINSICS_)
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
__m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
@ -466,7 +466,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept
#else
uint32x4_t V = vld1q_u32(pSource);
#endif
return vsetq_lane_u32(0, V, 3);
return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3));
#elif defined(_XM_SSE4_INTRINSICS_)
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
__m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
@ -614,7 +614,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept
V.vector4_u32[3] = pSource[3];
return V;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vld1q_u32(pSource);
return vreinterpretq_f32_u32(vld1q_u32(pSource));
#elif defined(_XM_SSE_INTRINSICS_)
__m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
return _mm_castsi128_ps(V);
@ -638,7 +638,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept
#ifdef _MSC_VER
return vld1q_u32_ex(pSource, 128);
#else
return vld1q_u32(pSource);
return vreinterpretq_f32_u32(vld1q_u32(pSource));
#endif
#elif defined(_XM_SSE_INTRINSICS_)
__m128i V = _mm_load_si128(reinterpret_cast<const __m128i*>(pSource));
@ -780,8 +780,8 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept
float32x4_t T = vextq_f32(v0, v1, 3);
XMMATRIX M;
M.r[0] = vandq_u32(v0, g_XMMask3);
M.r[1] = vandq_u32(T, g_XMMask3);
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3));
M.r[2] = vcombine_f32(vget_high_f32(v1), v2);
M.r[3] = g_XMIdentityR3;
return M;
@ -846,9 +846,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
float32x4_t T3 = vextq_f32(v2, v2, 1);
XMMATRIX M;
M.r[0] = vandq_u32(v0, g_XMMask3);
M.r[1] = vandq_u32(T1, g_XMMask3);
M.r[2] = vandq_u32(T2, g_XMMask3);
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
return M;
#elif defined(_XM_SSE_INTRINSICS_)
@ -930,9 +930,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
float32x4_t T3 = vextq_f32(v2, v2, 1);
XMMATRIX M;
M.r[0] = vandq_u32(v0, g_XMMask3);
M.r[1] = vandq_u32(T1, g_XMMask3);
M.r[2] = vandq_u32(T2, g_XMMask3);
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
return M;
#elif defined(_XM_SSE_INTRINSICS_)
@ -1012,9 +1012,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept
float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
XMMATRIX M = {};
M.r[0] = vandq_u32(T0, g_XMMask3);
M.r[1] = vandq_u32(T1, g_XMMask3);
M.r[2] = vandq_u32(T2, g_XMMask3);
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
return M;
#elif defined(_XM_SSE_INTRINSICS_)
@ -1096,9 +1096,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept
float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
XMMATRIX M = {};
M.r[0] = vandq_u32(T0, g_XMMask3);
M.r[1] = vandq_u32(T1, g_XMMask3);
M.r[2] = vandq_u32(T2, g_XMMask3);
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
return M;
#elif defined(_XM_SSE_INTRINSICS_)
@ -1283,7 +1283,7 @@ inline void XM_CALLCONV XMStoreInt2
pDestination[0] = V.vector4_u32[0];
pDestination[1] = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint32x2_t VL = vget_low_u32(V);
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
vst1_u32(pDestination, VL);
#elif defined(_XM_SSE_INTRINSICS_)
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
@ -1304,7 +1304,7 @@ inline void XM_CALLCONV XMStoreInt2A
pDestination[0] = V.vector4_u32[0];
pDestination[1] = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint32x2_t VL = vget_low_u32(V);
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
#ifdef _MSC_VER
vst1_u32_ex(pDestination, VL, 64);
#else
@ -1373,9 +1373,9 @@ inline void XM_CALLCONV XMStoreSInt2
pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
int32x2_t v = vget_low_s32(V);
v = vcvt_s32_f32(v);
vst1_s32(reinterpret_cast<int32_t*>(pDestination), v);
float32x2_t v = vget_low_f32(V);
int32x2_t iv = vcvt_s32_f32(v);
vst1_s32(reinterpret_cast<int32_t*>(pDestination), iv);
#elif defined(_XM_SSE_INTRINSICS_)
// In case of positive overflow, detect it
XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
@ -1443,7 +1443,7 @@ inline void XM_CALLCONV XMStoreInt3
pDestination[1] = V.vector4_u32[1];
pDestination[2] = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint32x2_t VL = vget_low_u32(V);
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
vst1_u32(pDestination, VL);
vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
#elif defined(_XM_SSE_INTRINSICS_)
@ -1468,7 +1468,7 @@ inline void XM_CALLCONV XMStoreInt3A
pDestination[1] = V.vector4_u32[1];
pDestination[2] = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint32x2_t VL = vget_low_u32(V);
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
#ifdef _MSC_VER
vst1_u32_ex(pDestination, VL, 64);
#else
@ -1634,7 +1634,7 @@ inline void XM_CALLCONV XMStoreInt4
pDestination[2] = V.vector4_u32[2];
pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_u32(pDestination, V);
vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
#endif
@ -1659,7 +1659,7 @@ inline void XM_CALLCONV XMStoreInt4A
#ifdef _MSC_VER
vst1q_u32_ex(pDestination, V, 128);
#else
vst1q_u32(pDestination, V);
vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
#endif
#elif defined(_XM_SSE_INTRINSICS_)
_mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));

View File

@ -48,23 +48,25 @@ inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept
return (i != 0); // i == 0 if nothing matched
#elif defined(_XM_ARM_NEON_INTRINSICS_)
// Load in registers
XMVECTOR vX = M.r[0];
XMVECTOR vY = M.r[1];
XMVECTOR vZ = M.r[2];
XMVECTOR vW = M.r[3];
float32x4_t vX = M.r[0];
float32x4_t vY = M.r[1];
float32x4_t vZ = M.r[2];
float32x4_t vW = M.r[3];
// Test themselves to check for NaN
vX = vmvnq_u32(vceqq_f32(vX, vX));
vY = vmvnq_u32(vceqq_f32(vY, vY));
vZ = vmvnq_u32(vceqq_f32(vZ, vZ));
vW = vmvnq_u32(vceqq_f32(vW, vW));
uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX));
uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY));
uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ));
uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW));
// Or all the results
vX = vorrq_u32(vX, vZ);
vY = vorrq_u32(vY, vW);
vX = vorrq_u32(vX, vY);
xmask = vorrq_u32(xmask, zmask);
ymask = vorrq_u32(ymask, wmask);
xmask = vorrq_u32(xmask, ymask);
// If any tested true, return true
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX));
uint16x4x2_t vTemp2 = vzip_u16(vTemp.val[0], vTemp.val[1]);
uint32_t r = vget_lane_u32(vTemp2.val[1], 1);
uint8x8x2_t vTemp = vzip_u8(
vget_low_u8(vreinterpretq_u8_u32(xmask)),
vget_high_u8(vreinterpretq_u8_u32(xmask)));
uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
return (r != 0);
#elif defined(_XM_SSE_INTRINSICS_)
// Load in registers
@ -113,24 +115,31 @@ inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept
} while (--i);
return (i != 0); // i == 0 if nothing matched
#elif defined(_XM_ARM_NEON_INTRINSICS_)
// Load in registers
float32x4_t vX = M.r[0];
float32x4_t vY = M.r[1];
float32x4_t vZ = M.r[2];
float32x4_t vW = M.r[3];
// Mask off the sign bits
XMVECTOR vTemp1 = vandq_u32(M.r[0], g_XMAbsMask);
XMVECTOR vTemp2 = vandq_u32(M.r[1], g_XMAbsMask);
XMVECTOR vTemp3 = vandq_u32(M.r[2], g_XMAbsMask);
XMVECTOR vTemp4 = vandq_u32(M.r[3], g_XMAbsMask);
vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask));
vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask));
vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask));
vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask));
// Compare to infinity
vTemp1 = vceqq_f32(vTemp1, g_XMInfinity);
vTemp2 = vceqq_f32(vTemp2, g_XMInfinity);
vTemp3 = vceqq_f32(vTemp3, g_XMInfinity);
vTemp4 = vceqq_f32(vTemp4, g_XMInfinity);
uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity);
uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity);
uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity);
uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity);
// Or the answers together
vTemp1 = vorrq_u32(vTemp1, vTemp2);
vTemp3 = vorrq_u32(vTemp3, vTemp4);
vTemp1 = vorrq_u32(vTemp1, vTemp3);
// If any are infinity, the signs are true.
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
uint16x4x2_t vTemp5 = vzip_u16(vTemp.val[0], vTemp.val[1]);
uint32_t r = vget_lane_u32(vTemp5.val[1], 1);
xmask = vorrq_u32(xmask, zmask);
ymask = vorrq_u32(ymask, wmask);
xmask = vorrq_u32(xmask, ymask);
// If any tested true, return true
uint8x8x2_t vTemp = vzip_u8(
vget_low_u8(vreinterpretq_u8_u32(xmask)),
vget_high_u8(vreinterpretq_u8_u32(xmask)));
uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
return (r != 0);
#elif defined(_XM_SSE_INTRINSICS_)
// Mask off the sign bits
@ -187,16 +196,16 @@ inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept
uOne |= uZero;
return (uOne == 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
XMVECTOR vTemp1 = vceqq_f32(M.r[0], g_XMIdentityR0);
XMVECTOR vTemp2 = vceqq_f32(M.r[1], g_XMIdentityR1);
XMVECTOR vTemp3 = vceqq_f32(M.r[2], g_XMIdentityR2);
XMVECTOR vTemp4 = vceqq_f32(M.r[3], g_XMIdentityR3);
vTemp1 = vandq_u32(vTemp1, vTemp2);
vTemp3 = vandq_u32(vTemp3, vTemp4);
vTemp1 = vandq_u32(vTemp1, vTemp3);
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
uint16x4x2_t vTemp5 = vzip_u16(vTemp.val[0], vTemp.val[1]);
uint32_t r = vget_lane_u32(vTemp5.val[1], 1);
uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0);
uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1);
uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2);
uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3);
xmask = vandq_u32(xmask, zmask);
ymask = vandq_u32(ymask, wmask);
xmask = vandq_u32(xmask, ymask);
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask)));
uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
return (r == 0xFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0);
@ -265,10 +274,10 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply
float32x2_t VL = vget_low_f32(M1.r[0]);
float32x2_t VH = vget_high_f32(M1.r[0]);
// Perform the operation on the first row
XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0);
XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1);
XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
mResult.r[0] = vaddq_f32(vZ, vW);
// Repeat for the other 3 rows
VL = vget_low_f32(M1.r[1]);
@ -478,10 +487,10 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose
float32x2_t VL = vget_low_f32(M1.r[0]);
float32x2_t VH = vget_high_f32(M1.r[0]);
// Perform the operation on the first row
XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0);
XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1);
XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
float32x4_t r0 = vaddq_f32(vZ, vW);
// Repeat for the other 3 rows
VL = vget_low_f32(M1.r[1]);
@ -1403,9 +1412,9 @@ inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept
#elif defined(_XM_ARM_NEON_INTRINSICS_)
XMMATRIX M;
M.r[0] = vandq_u32(Scale, g_XMMaskX);
M.r[1] = vandq_u32(Scale, g_XMMaskY);
M.r[2] = vandq_u32(Scale, g_XMMaskZ);
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX));
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY));
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ));
M.r[3] = g_XMIdentityR3.v;
return M;
#elif defined(_XM_SSE_INTRINSICS_)
@ -1455,12 +1464,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept
float fCosAngle;
XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMVECTOR T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
T1 = vsetq_lane_f32(fSinAngle, T1, 2);
XMVECTOR T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
T2 = vsetq_lane_f32(fCosAngle, T2, 2);
XMMATRIX M;
@ -1528,12 +1537,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept
float fCosAngle;
XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMVECTOR T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
T0 = vsetq_lane_f32(-fSinAngle, T0, 2);
XMVECTOR T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
T2 = vsetq_lane_f32(fCosAngle, T2, 2);
XMMATRIX M;
@ -1601,12 +1610,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept
float fCosAngle;
XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMVECTOR T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
T0 = vsetq_lane_f32(fSinAngle, T0, 1);
XMVECTOR T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
T1 = vsetq_lane_f32(fCosAngle, T1, 1);
XMMATRIX M;
@ -2166,7 +2175,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float TwoNearZ = NearZ + NearZ;
float fRange = FarZ / (FarZ - NearZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1);
@ -2253,7 +2262,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float TwoNearZ = NearZ + NearZ;
float fRange = FarZ / (NearZ - FarZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
@ -2351,7 +2360,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH
float fRange = FarZ / (FarZ - NearZ);
float Height = CosFov / SinFov;
float Width = Height / AspectRatio;
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(Width, Zero, 0);
@ -2452,7 +2461,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH
float fRange = FarZ / (NearZ - FarZ);
float Height = CosFov / SinFov;
float Width = Height / AspectRatio;
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(Width, Zero, 0);
@ -2549,7 +2558,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
float fRange = FarZ / (FarZ - NearZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
@ -2647,7 +2656,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
float fRange = FarZ / (NearZ - FarZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
@ -2737,7 +2746,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float fRange = 1.0f / (FarZ - NearZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
@ -2821,7 +2830,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float fRange = 1.0f / (NearZ - FarZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
@ -2910,7 +2919,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
float fRange = 1.0f / (FarZ - NearZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
@ -3010,7 +3019,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
float fRange = 1.0f / (NearZ - FarZ);
const XMVECTOR Zero = vdupq_n_f32(0);
const float32x4_t Zero = vdupq_n_f32(0);
XMMATRIX M;
M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
@ -3178,7 +3187,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
R0 = vmul_f32(S0, R0);
S0 = vrecps_f32(R0, vS);
R0 = vmul_f32(S0, R0);
float32x4_t Reciprocal = vcombine_u32(R0, R0);
float32x4_t Reciprocal = vcombine_f32(R0, R0);
r[0] = vmulq_f32(r[0], Reciprocal);
r[1] = vmulq_f32(r[1], Reciprocal);
r[2] = vmulq_f32(r[2], Reciprocal);
@ -3266,7 +3275,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept
R0 = vmul_f32(S0, R0);
S0 = vrecps_f32(R0, vS);
R0 = vmul_f32(S0, R0);
float32x4_t Reciprocal = vcombine_u32(R0, R0);
float32x4_t Reciprocal = vcombine_f32(R0, R0);
XMMATRIX R;
R.r[0] = vmulq_f32(r[0], Reciprocal);
R.r[1] = vmulq_f32(r[1], Reciprocal);

View File

@ -120,12 +120,12 @@ inline XMVECTOR XM_CALLCONV XMQuaternionMultiply
vResult = vmlaq_f32(vResult, Q2X, ControlWZYX);
// Mul by Q1ZWXY
vTemp = vrev64q_u32(vTemp);
vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
Q2Y = vmulq_f32(Q2Y, vTemp);
vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);
// Mul by Q1YXWZ
vTemp = vrev64q_u32(vTemp);
vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
Q2Z = vmulq_f32(Q2Z, vTemp);
vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
@ -728,74 +728,74 @@ inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
XMVECTOR r0 = M.r[0];
XMVECTOR r1 = M.r[1];
XMVECTOR r2 = M.r[2];
float32x4_t r0 = M.r[0];
float32x4_t r1 = M.r[1];
float32x4_t r2 = M.r[2];
XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
// x^2 >= y^2 equivalent to r11 - r00 <= 0
XMVECTOR r11mr00 = vsubq_f32(r11, r00);
XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero);
float32x4_t r11mr00 = vsubq_f32(r11, r00);
uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero);
// z^2 >= w^2 equivalent to r11 + r00 <= 0
XMVECTOR r11pr00 = vaddq_f32(r11, r00);
XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero);
float32x4_t r11pr00 = vaddq_f32(r11, r00);
uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero);
// x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
// (4*x^2, 4*y^2, 4*z^2, 4*w^2)
XMVECTOR t0 = vmulq_f32(XMPMMP, r00);
XMVECTOR x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
float32x4_t t0 = vmulq_f32(XMPMMP, r00);
float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22);
x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne);
// (r01, r02, r12, r11)
t0 = vextq_f32(r0, r0, 1);
XMVECTOR t1 = vextq_f32(r1, r1, 1);
float32x4_t t1 = vextq_f32(r1, r1, 1);
t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1)));
// (r10, r20, r21, r10)
t1 = vextq_f32(r2, r2, 3);
XMVECTOR r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
t1 = vbslq_f32(Select0110, t1, r10);
// (4*x*y, 4*x*z, 4*y*z, unused)
XMVECTOR xyxzyz = vaddq_f32(t0, t1);
float32x4_t xyxzyz = vaddq_f32(t0, t1);
// (r21, r20, r10, r10)
t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10));
// (r12, r02, r01, r12)
XMVECTOR t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
XMVECTOR t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
t1 = vbslq_f32(Select0110, t2, t3);
// (4*x*w, 4*y*w, 4*z*w, unused)
XMVECTOR xwywzw = vsubq_f32(t0, t1);
float32x4_t xwywzw = vsubq_f32(t0, t1);
xwywzw = vmulq_f32(XMMPMP, xwywzw);
// (4*x*x, 4*x*y, 4*x*z, 4*x*w)
t0 = vextq_f32(xyxzyz, xyxzyz, 3);
t1 = vbslq_f32(Select0110, t0, x2y2z2w2);
t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0);
XMVECTOR tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
// (4*y*x, 4*y*y, 4*y*z, 4*y*w)
t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2);
t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1);
XMVECTOR tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
// (4*z*x, 4*z*y, 4*z*z, 4*z*w)
t0 = vextq_f32(xyxzyz, xyxzyz, 1);
t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw)));
XMVECTOR tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
// (4*w*x, 4*w*y, 4*w*z, 4*w*w)
XMVECTOR tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
// Select the row of the tensor-product matrix that has the largest
// magnitude.
@ -1358,8 +1358,8 @@ inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept
} } };
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
XMVECTOR vTemp = veorq_u32(vColor, g_XMNegate3);
return vaddq_f32(vTemp, g_XMOne3);
uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3);
return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3);
#elif defined(_XM_SSE_INTRINSICS_)
// Negate only x,y and z.
XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3);

File diff suppressed because it is too large Load Diff

View File

@ -23,7 +23,7 @@ inline float XMConvertHalfToFloat(HALF Value) noexcept
__m128i V1 = _mm_cvtsi32_si128(static_cast<int>(Value));
__m128 V2 = _mm_cvtph_ps(V1);
return _mm_cvtss_f32(V2);
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
uint16x4_t vHalf = vdup_n_u16(Value);
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
return vgetq_lane_f32(vFloat, 0);
@ -255,7 +255,7 @@ inline float* XMConvertHalfToFloatStream
XM_SFENCE();
return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
@ -389,7 +389,7 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
__m128 V1 = _mm_set_ss(Value);
__m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
return static_cast<HALF>(_mm_extract_epi16(V2, 0));
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
float32x4_t vFloat = vdupq_n_f32(Value);
float16x4_t vHalf = vcvt_f16_f32(vFloat);
return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
@ -609,7 +609,7 @@ inline HALF* XMConvertFloatToHalfStream
}
return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
@ -1091,9 +1091,9 @@ inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8));
uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
vInt = vandq_s32(vInt, g_XMMaskXY);
vInt = vandq_u32(vInt, g_XMMaskXY);
return vcvtq_f32_u32(vInt);
#elif defined(_XM_SSE_INTRINSICS_)
static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } };
@ -1317,9 +1317,9 @@ inline XMVECTOR XM_CALLCONV XMLoadShortN4(const XMSHORTN4* pSource) noexcept
#elif defined(_XM_ARM_NEON_INTRINSICS_)
int16x4_t vInt = vld1_s16(reinterpret_cast<const int16_t*>(pSource));
int32x4_t V = vmovl_s16(vInt);
V = vcvtq_f32_s32(V);
V = vmulq_n_f32(V, 1.0f / 32767.0f);
return vmaxq_f32(V, vdupq_n_f32(-1.f));
float32x4_t vResult = vcvtq_f32_s32(V);
vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f);
return vmaxq_f32(vResult, vdupq_n_f32(-1.f));
#elif defined(_XM_SSE_INTRINSICS_)
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
@ -1391,8 +1391,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept
#elif defined(_XM_ARM_NEON_INTRINSICS_)
uint16x4_t vInt = vld1_u16(reinterpret_cast<const uint16_t*>(pSource));
uint32x4_t V = vmovl_u16(vInt);
V = vcvtq_f32_u32(V);
return vmulq_n_f32(V, 1.0f / 65535.0f);
float32x4_t vResult = vcvtq_f32_u32(V);
return vmulq_n_f32(vResult, 1.0f / 65535.0f);
#elif defined(_XM_SSE_INTRINSICS_)
static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 1.0f / (65535.0f * 65536.0f) } } };
static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } };
@ -1626,7 +1626,7 @@ inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept
uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
vInt = vandq_u32(vInt, g_XMMaskDec4);
int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias);
vTemp = veorq_u32(vTemp, g_XMFlipW);
vTemp = veorq_s32(vTemp, g_XMFlipW);
float32x4_t R = vcvtq_f32_s32(vTemp);
R = vaddq_f32(R, g_XMAddUDec4);
return vmulq_f32(R, XRMul);
@ -2686,8 +2686,7 @@ inline void XM_CALLCONV XMStoreShortN4
float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
vResult = vmulq_n_f32(vResult, 32767.0f);
vResult = vcvtq_s32_f32(vResult);
int16x4_t vInt = vmovn_s32(vResult);
int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
@ -2724,8 +2723,7 @@ inline void XM_CALLCONV XMStoreShort4
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float32x4_t vResult = vmaxq_f32(V, g_ShortMin);
vResult = vminq_f32(vResult, g_ShortMax);
vResult = vcvtq_s32_f32(vResult);
int16x4_t vInt = vmovn_s32(vResult);
int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
#elif defined(_XM_SSE_INTRINSICS_)
// Bounds check
@ -2767,8 +2765,7 @@ inline void XM_CALLCONV XMStoreUShortN4
vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
vResult = vmulq_n_f32(vResult, 65535.0f);
vResult = vaddq_f32(vResult, g_XMOneHalf);
vResult = vcvtq_u32_f32(vResult);
uint16x4_t vInt = vmovn_u32(vResult);
uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
#elif defined(_XM_SSE_INTRINSICS_)
// Bounds check
@ -2812,8 +2809,7 @@ inline void XM_CALLCONV XMStoreUShort4
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
vResult = vminq_f32(vResult, g_UShortMax);
vResult = vcvtq_u32_f32(vResult);
uint16x4_t vInt = vmovn_u32(vResult);
uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
#elif defined(_XM_SSE_INTRINSICS_)
// Bounds check
@ -2947,7 +2943,7 @@ inline void XM_CALLCONV XMStoreXDec4
vTemp = vorr_u32(vTemp, vTemp2);
// Perform a single bit left shift on y|w
vTemp2 = vdup_lane_u32(vTemp, 1);
vTemp2 = vadd_s32(vTemp2, vTemp2);
vTemp2 = vadd_u32(vTemp2, vTemp2);
vTemp = vorr_u32(vTemp, vTemp2);
vst1_lane_u32(&pDestination->v, vTemp, 0);
#elif defined(_XM_SSE_INTRINSICS_)
@ -3640,7 +3636,7 @@ inline void XM_CALLCONV XMStoreU555
vTemp = vorr_u32(vTemp, vTemp2);
// Perform a single bit left shift on y|w
vTemp2 = vdup_lane_u32(vTemp, 1);
vTemp2 = vadd_s32(vTemp2, vTemp2);
vTemp2 = vadd_u32(vTemp2, vTemp2);
vTemp = vorr_u32(vTemp, vTemp2);
vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
#elif defined(_XM_SSE_INTRINSICS_)