mirror of
https://github.com/microsoft/DirectXMath
synced 2024-11-25 05:30:04 +00:00
ARM-NEON intrinsics code paths now type-safe (#115)
This commit is contained in:
parent
404c59a7dd
commit
9962628ef4
@ -196,7 +196,7 @@
|
||||
|
||||
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
|
||||
#if defined(__clang__)
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
#define XM_PREFETCH( a ) __builtin_prefetch(a)
|
||||
#elif defined(_MSC_VER)
|
||||
#define XM_PREFETCH( a ) __prefetch(a)
|
||||
@ -380,9 +380,13 @@ namespace DirectX
|
||||
|
||||
inline operator XMVECTOR() const noexcept { return v; }
|
||||
inline operator const float* () const noexcept { return f; }
|
||||
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
|
||||
#ifdef _XM_NO_INTRINSICS_
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
|
||||
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
|
||||
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
|
||||
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -395,9 +399,13 @@ namespace DirectX
|
||||
};
|
||||
|
||||
inline operator XMVECTOR() const noexcept { return v; }
|
||||
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
|
||||
#ifdef _XM_NO_INTRINSICS_
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
|
||||
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
|
||||
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
|
||||
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -410,9 +418,13 @@ namespace DirectX
|
||||
};
|
||||
|
||||
inline operator XMVECTOR() const noexcept { return v; }
|
||||
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
|
||||
#ifdef _XM_NO_INTRINSICS_
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
|
||||
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
|
||||
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
|
||||
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -425,9 +437,13 @@ namespace DirectX
|
||||
};
|
||||
|
||||
inline operator XMVECTOR() const noexcept { return v; }
|
||||
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
|
||||
#ifdef _XM_NO_INTRINSICS_
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
|
||||
inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
|
||||
inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
|
||||
inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -2166,7 +2182,7 @@ namespace DirectX
|
||||
// Convert DivExponent into 1.0f/(1<<DivExponent)
|
||||
uint32_t uScale = 0x3F800000U - (DivExponent << 23);
|
||||
// Splat the scalar value (It's really a float)
|
||||
vScale = vdupq_n_u32(uScale);
|
||||
vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
|
||||
// Multiply by the reciprocal (Perform a right shift by DivExponent)
|
||||
vResult = vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
|
||||
return vResult;
|
||||
|
@ -39,7 +39,7 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float fScale = 1.0f / (float)(1U << DivExponent);
|
||||
float32x4_t vResult = vcvtq_f32_s32(VInt);
|
||||
float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
|
||||
return vmulq_n_f32(vResult, fScale);
|
||||
#else // _XM_SSE_INTRINSICS_
|
||||
// Convert to floats
|
||||
@ -91,10 +91,10 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
|
||||
// Float to int conversion
|
||||
int32x4_t vResulti = vcvtq_s32_f32(vResult);
|
||||
// If there was positive overflow, set to 0x7FFFFFFF
|
||||
vResult = vandq_u32(vOverflow, g_XMAbsMask);
|
||||
vOverflow = vbicq_u32(vResulti, vOverflow);
|
||||
vOverflow = vorrq_u32(vOverflow, vResult);
|
||||
return vOverflow;
|
||||
vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
|
||||
vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
|
||||
vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
|
||||
return vreinterpretq_f32_u32(vOverflow);
|
||||
#else // _XM_SSE_INTRINSICS_
|
||||
XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
|
||||
vResult = _mm_mul_ps(vResult, VFloat);
|
||||
@ -129,7 +129,7 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
|
||||
return Result;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float fScale = 1.0f / (float)(1U << DivExponent);
|
||||
float32x4_t vResult = vcvtq_f32_u32(VUInt);
|
||||
float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
|
||||
return vmulq_n_f32(vResult, fScale);
|
||||
#else // _XM_SSE_INTRINSICS_
|
||||
// For the values that are higher than 0x7FFFFFFF, a fixup is needed
|
||||
@ -191,9 +191,9 @@ inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
|
||||
// Float to int conversion
|
||||
uint32x4_t vResulti = vcvtq_u32_f32(vResult);
|
||||
// If there was overflow, set to 0xFFFFFFFFU
|
||||
vResult = vbicq_u32(vResulti, vOverflow);
|
||||
vOverflow = vorrq_u32(vOverflow, vResult);
|
||||
return vOverflow;
|
||||
vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
|
||||
vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
|
||||
return vreinterpretq_f32_u32(vOverflow);
|
||||
#else // _XM_SSE_INTRINSICS_
|
||||
XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
|
||||
vResult = _mm_mul_ps(vResult, VFloat);
|
||||
@ -240,7 +240,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept
|
||||
return V;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint32x4_t zero = vdupq_n_u32(0);
|
||||
return vld1q_lane_u32(pSource, zero, 0);
|
||||
return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0));
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return _mm_load_ss(reinterpret_cast<const float*>(pSource));
|
||||
#endif
|
||||
@ -281,7 +281,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint32x2_t x = vld1_u32(pSource);
|
||||
uint32x2_t zero = vdup_n_u32(0);
|
||||
return vcombine_u32(x, zero);
|
||||
return vreinterpretq_f32_u32(vcombine_u32(x, zero));
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
|
||||
#endif
|
||||
@ -307,7 +307,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept
|
||||
uint32x2_t x = vld1_u32(pSource);
|
||||
#endif
|
||||
uint32x2_t zero = vdup_n_u32(0);
|
||||
return vcombine_u32(x, zero);
|
||||
return vreinterpretq_f32_u32(vcombine_u32(x, zero));
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
|
||||
#endif
|
||||
@ -434,7 +434,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept
|
||||
uint32x2_t x = vld1_u32(pSource);
|
||||
uint32x2_t zero = vdup_n_u32(0);
|
||||
uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0);
|
||||
return vcombine_u32(x, y);
|
||||
return vreinterpretq_f32_u32(vcombine_u32(x, y));
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
|
||||
__m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
|
||||
@ -466,7 +466,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept
|
||||
#else
|
||||
uint32x4_t V = vld1q_u32(pSource);
|
||||
#endif
|
||||
return vsetq_lane_u32(0, V, 3);
|
||||
return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3));
|
||||
#elif defined(_XM_SSE4_INTRINSICS_)
|
||||
__m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
|
||||
__m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
|
||||
@ -614,7 +614,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept
|
||||
V.vector4_u32[3] = pSource[3];
|
||||
return V;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
return vld1q_u32(pSource);
|
||||
return vreinterpretq_f32_u32(vld1q_u32(pSource));
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
|
||||
return _mm_castsi128_ps(V);
|
||||
@ -638,7 +638,7 @@ inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept
|
||||
#ifdef _MSC_VER
|
||||
return vld1q_u32_ex(pSource, 128);
|
||||
#else
|
||||
return vld1q_u32(pSource);
|
||||
return vreinterpretq_f32_u32(vld1q_u32(pSource));
|
||||
#endif
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
__m128i V = _mm_load_si128(reinterpret_cast<const __m128i*>(pSource));
|
||||
@ -780,8 +780,8 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept
|
||||
float32x4_t T = vextq_f32(v0, v1, 3);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vandq_u32(v0, g_XMMask3);
|
||||
M.r[1] = vandq_u32(T, g_XMMask3);
|
||||
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
|
||||
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3));
|
||||
M.r[2] = vcombine_f32(vget_high_f32(v1), v2);
|
||||
M.r[3] = g_XMIdentityR3;
|
||||
return M;
|
||||
@ -846,9 +846,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
|
||||
float32x4_t T3 = vextq_f32(v2, v2, 1);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vandq_u32(v0, g_XMMask3);
|
||||
M.r[1] = vandq_u32(T1, g_XMMask3);
|
||||
M.r[2] = vandq_u32(T2, g_XMMask3);
|
||||
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
|
||||
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
|
||||
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
|
||||
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
|
||||
return M;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -930,9 +930,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
|
||||
float32x4_t T3 = vextq_f32(v2, v2, 1);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vandq_u32(v0, g_XMMask3);
|
||||
M.r[1] = vandq_u32(T1, g_XMMask3);
|
||||
M.r[2] = vandq_u32(T2, g_XMMask3);
|
||||
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
|
||||
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
|
||||
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
|
||||
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
|
||||
return M;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -1012,9 +1012,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept
|
||||
float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
|
||||
|
||||
XMMATRIX M = {};
|
||||
M.r[0] = vandq_u32(T0, g_XMMask3);
|
||||
M.r[1] = vandq_u32(T1, g_XMMask3);
|
||||
M.r[2] = vandq_u32(T2, g_XMMask3);
|
||||
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
|
||||
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
|
||||
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
|
||||
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
|
||||
return M;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -1096,9 +1096,9 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept
|
||||
float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
|
||||
|
||||
XMMATRIX M = {};
|
||||
M.r[0] = vandq_u32(T0, g_XMMask3);
|
||||
M.r[1] = vandq_u32(T1, g_XMMask3);
|
||||
M.r[2] = vandq_u32(T2, g_XMMask3);
|
||||
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
|
||||
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
|
||||
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
|
||||
M.r[3] = vsetq_lane_f32(1.f, T3, 3);
|
||||
return M;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -1283,7 +1283,7 @@ inline void XM_CALLCONV XMStoreInt2
|
||||
pDestination[0] = V.vector4_u32[0];
|
||||
pDestination[1] = V.vector4_u32[1];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint32x2_t VL = vget_low_u32(V);
|
||||
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
|
||||
vst1_u32(pDestination, VL);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
_mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
|
||||
@ -1304,7 +1304,7 @@ inline void XM_CALLCONV XMStoreInt2A
|
||||
pDestination[0] = V.vector4_u32[0];
|
||||
pDestination[1] = V.vector4_u32[1];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint32x2_t VL = vget_low_u32(V);
|
||||
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
|
||||
#ifdef _MSC_VER
|
||||
vst1_u32_ex(pDestination, VL, 64);
|
||||
#else
|
||||
@ -1373,9 +1373,9 @@ inline void XM_CALLCONV XMStoreSInt2
|
||||
pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
|
||||
pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
int32x2_t v = vget_low_s32(V);
|
||||
v = vcvt_s32_f32(v);
|
||||
vst1_s32(reinterpret_cast<int32_t*>(pDestination), v);
|
||||
float32x2_t v = vget_low_f32(V);
|
||||
int32x2_t iv = vcvt_s32_f32(v);
|
||||
vst1_s32(reinterpret_cast<int32_t*>(pDestination), iv);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// In case of positive overflow, detect it
|
||||
XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
|
||||
@ -1443,7 +1443,7 @@ inline void XM_CALLCONV XMStoreInt3
|
||||
pDestination[1] = V.vector4_u32[1];
|
||||
pDestination[2] = V.vector4_u32[2];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint32x2_t VL = vget_low_u32(V);
|
||||
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
|
||||
vst1_u32(pDestination, VL);
|
||||
vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -1468,7 +1468,7 @@ inline void XM_CALLCONV XMStoreInt3A
|
||||
pDestination[1] = V.vector4_u32[1];
|
||||
pDestination[2] = V.vector4_u32[2];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint32x2_t VL = vget_low_u32(V);
|
||||
uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
|
||||
#ifdef _MSC_VER
|
||||
vst1_u32_ex(pDestination, VL, 64);
|
||||
#else
|
||||
@ -1634,7 +1634,7 @@ inline void XM_CALLCONV XMStoreInt4
|
||||
pDestination[2] = V.vector4_u32[2];
|
||||
pDestination[3] = V.vector4_u32[3];
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
vst1q_u32(pDestination, V);
|
||||
vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
|
||||
#endif
|
||||
@ -1659,7 +1659,7 @@ inline void XM_CALLCONV XMStoreInt4A
|
||||
#ifdef _MSC_VER
|
||||
vst1q_u32_ex(pDestination, V, 128);
|
||||
#else
|
||||
vst1q_u32(pDestination, V);
|
||||
vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
|
||||
#endif
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
|
||||
|
@ -48,23 +48,25 @@ inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept
|
||||
return (i != 0); // i == 0 if nothing matched
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
// Load in registers
|
||||
XMVECTOR vX = M.r[0];
|
||||
XMVECTOR vY = M.r[1];
|
||||
XMVECTOR vZ = M.r[2];
|
||||
XMVECTOR vW = M.r[3];
|
||||
float32x4_t vX = M.r[0];
|
||||
float32x4_t vY = M.r[1];
|
||||
float32x4_t vZ = M.r[2];
|
||||
float32x4_t vW = M.r[3];
|
||||
// Test themselves to check for NaN
|
||||
vX = vmvnq_u32(vceqq_f32(vX, vX));
|
||||
vY = vmvnq_u32(vceqq_f32(vY, vY));
|
||||
vZ = vmvnq_u32(vceqq_f32(vZ, vZ));
|
||||
vW = vmvnq_u32(vceqq_f32(vW, vW));
|
||||
uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX));
|
||||
uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY));
|
||||
uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ));
|
||||
uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW));
|
||||
// Or all the results
|
||||
vX = vorrq_u32(vX, vZ);
|
||||
vY = vorrq_u32(vY, vW);
|
||||
vX = vorrq_u32(vX, vY);
|
||||
xmask = vorrq_u32(xmask, zmask);
|
||||
ymask = vorrq_u32(ymask, wmask);
|
||||
xmask = vorrq_u32(xmask, ymask);
|
||||
// If any tested true, return true
|
||||
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX));
|
||||
uint16x4x2_t vTemp2 = vzip_u16(vTemp.val[0], vTemp.val[1]);
|
||||
uint32_t r = vget_lane_u32(vTemp2.val[1], 1);
|
||||
uint8x8x2_t vTemp = vzip_u8(
|
||||
vget_low_u8(vreinterpretq_u8_u32(xmask)),
|
||||
vget_high_u8(vreinterpretq_u8_u32(xmask)));
|
||||
uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
|
||||
uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
|
||||
return (r != 0);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Load in registers
|
||||
@ -113,24 +115,31 @@ inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept
|
||||
} while (--i);
|
||||
return (i != 0); // i == 0 if nothing matched
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
// Load in registers
|
||||
float32x4_t vX = M.r[0];
|
||||
float32x4_t vY = M.r[1];
|
||||
float32x4_t vZ = M.r[2];
|
||||
float32x4_t vW = M.r[3];
|
||||
// Mask off the sign bits
|
||||
XMVECTOR vTemp1 = vandq_u32(M.r[0], g_XMAbsMask);
|
||||
XMVECTOR vTemp2 = vandq_u32(M.r[1], g_XMAbsMask);
|
||||
XMVECTOR vTemp3 = vandq_u32(M.r[2], g_XMAbsMask);
|
||||
XMVECTOR vTemp4 = vandq_u32(M.r[3], g_XMAbsMask);
|
||||
vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask));
|
||||
vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask));
|
||||
vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask));
|
||||
vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask));
|
||||
// Compare to infinity
|
||||
vTemp1 = vceqq_f32(vTemp1, g_XMInfinity);
|
||||
vTemp2 = vceqq_f32(vTemp2, g_XMInfinity);
|
||||
vTemp3 = vceqq_f32(vTemp3, g_XMInfinity);
|
||||
vTemp4 = vceqq_f32(vTemp4, g_XMInfinity);
|
||||
uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity);
|
||||
uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity);
|
||||
uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity);
|
||||
uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity);
|
||||
// Or the answers together
|
||||
vTemp1 = vorrq_u32(vTemp1, vTemp2);
|
||||
vTemp3 = vorrq_u32(vTemp3, vTemp4);
|
||||
vTemp1 = vorrq_u32(vTemp1, vTemp3);
|
||||
// If any are infinity, the signs are true.
|
||||
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
|
||||
uint16x4x2_t vTemp5 = vzip_u16(vTemp.val[0], vTemp.val[1]);
|
||||
uint32_t r = vget_lane_u32(vTemp5.val[1], 1);
|
||||
xmask = vorrq_u32(xmask, zmask);
|
||||
ymask = vorrq_u32(ymask, wmask);
|
||||
xmask = vorrq_u32(xmask, ymask);
|
||||
// If any tested true, return true
|
||||
uint8x8x2_t vTemp = vzip_u8(
|
||||
vget_low_u8(vreinterpretq_u8_u32(xmask)),
|
||||
vget_high_u8(vreinterpretq_u8_u32(xmask)));
|
||||
uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
|
||||
uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
|
||||
return (r != 0);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Mask off the sign bits
|
||||
@ -187,16 +196,16 @@ inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept
|
||||
uOne |= uZero;
|
||||
return (uOne == 0);
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
XMVECTOR vTemp1 = vceqq_f32(M.r[0], g_XMIdentityR0);
|
||||
XMVECTOR vTemp2 = vceqq_f32(M.r[1], g_XMIdentityR1);
|
||||
XMVECTOR vTemp3 = vceqq_f32(M.r[2], g_XMIdentityR2);
|
||||
XMVECTOR vTemp4 = vceqq_f32(M.r[3], g_XMIdentityR3);
|
||||
vTemp1 = vandq_u32(vTemp1, vTemp2);
|
||||
vTemp3 = vandq_u32(vTemp3, vTemp4);
|
||||
vTemp1 = vandq_u32(vTemp1, vTemp3);
|
||||
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
|
||||
uint16x4x2_t vTemp5 = vzip_u16(vTemp.val[0], vTemp.val[1]);
|
||||
uint32_t r = vget_lane_u32(vTemp5.val[1], 1);
|
||||
uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0);
|
||||
uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1);
|
||||
uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2);
|
||||
uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3);
|
||||
xmask = vandq_u32(xmask, zmask);
|
||||
ymask = vandq_u32(ymask, wmask);
|
||||
xmask = vandq_u32(xmask, ymask);
|
||||
uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask)));
|
||||
uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
|
||||
uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
|
||||
return (r == 0xFFFFFFFFU);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0);
|
||||
@ -265,10 +274,10 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply
|
||||
float32x2_t VL = vget_low_f32(M1.r[0]);
|
||||
float32x2_t VH = vget_high_f32(M1.r[0]);
|
||||
// Perform the operation on the first row
|
||||
XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0);
|
||||
XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1);
|
||||
XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
|
||||
XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
|
||||
float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
|
||||
float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
|
||||
float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
|
||||
float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
|
||||
mResult.r[0] = vaddq_f32(vZ, vW);
|
||||
// Repeat for the other 3 rows
|
||||
VL = vget_low_f32(M1.r[1]);
|
||||
@ -478,10 +487,10 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose
|
||||
float32x2_t VL = vget_low_f32(M1.r[0]);
|
||||
float32x2_t VH = vget_high_f32(M1.r[0]);
|
||||
// Perform the operation on the first row
|
||||
XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0);
|
||||
XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1);
|
||||
XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
|
||||
XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
|
||||
float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
|
||||
float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
|
||||
float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
|
||||
float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
|
||||
float32x4_t r0 = vaddq_f32(vZ, vW);
|
||||
// Repeat for the other 3 rows
|
||||
VL = vget_low_f32(M1.r[1]);
|
||||
@ -1403,9 +1412,9 @@ inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept
|
||||
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
XMMATRIX M;
|
||||
M.r[0] = vandq_u32(Scale, g_XMMaskX);
|
||||
M.r[1] = vandq_u32(Scale, g_XMMaskY);
|
||||
M.r[2] = vandq_u32(Scale, g_XMMaskZ);
|
||||
M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX));
|
||||
M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY));
|
||||
M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ));
|
||||
M.r[3] = g_XMIdentityR3.v;
|
||||
return M;
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -1455,12 +1464,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept
|
||||
float fCosAngle;
|
||||
XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
|
||||
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMVECTOR T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
|
||||
float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
|
||||
T1 = vsetq_lane_f32(fSinAngle, T1, 2);
|
||||
|
||||
XMVECTOR T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
|
||||
float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
|
||||
T2 = vsetq_lane_f32(fCosAngle, T2, 2);
|
||||
|
||||
XMMATRIX M;
|
||||
@ -1528,12 +1537,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept
|
||||
float fCosAngle;
|
||||
XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
|
||||
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMVECTOR T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
|
||||
float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
|
||||
T0 = vsetq_lane_f32(-fSinAngle, T0, 2);
|
||||
|
||||
XMVECTOR T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
|
||||
float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
|
||||
T2 = vsetq_lane_f32(fCosAngle, T2, 2);
|
||||
|
||||
XMMATRIX M;
|
||||
@ -1601,12 +1610,12 @@ inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept
|
||||
float fCosAngle;
|
||||
XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
|
||||
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMVECTOR T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
|
||||
float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
|
||||
T0 = vsetq_lane_f32(fSinAngle, T0, 1);
|
||||
|
||||
XMVECTOR T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
|
||||
float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
|
||||
T1 = vsetq_lane_f32(fCosAngle, T1, 1);
|
||||
|
||||
XMMATRIX M;
|
||||
@ -2166,7 +2175,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float TwoNearZ = NearZ + NearZ;
|
||||
float fRange = FarZ / (FarZ - NearZ);
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
|
||||
M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1);
|
||||
@ -2253,7 +2262,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float TwoNearZ = NearZ + NearZ;
|
||||
float fRange = FarZ / (NearZ - FarZ);
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
|
||||
@ -2351,7 +2360,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH
|
||||
float fRange = FarZ / (FarZ - NearZ);
|
||||
float Height = CosFov / SinFov;
|
||||
float Width = Height / AspectRatio;
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(Width, Zero, 0);
|
||||
@ -2452,7 +2461,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH
|
||||
float fRange = FarZ / (NearZ - FarZ);
|
||||
float Height = CosFov / SinFov;
|
||||
float Width = Height / AspectRatio;
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(Width, Zero, 0);
|
||||
@ -2549,7 +2558,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH
|
||||
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
|
||||
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
|
||||
float fRange = FarZ / (FarZ - NearZ);
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
|
||||
@ -2647,7 +2656,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH
|
||||
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
|
||||
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
|
||||
float fRange = FarZ / (NearZ - FarZ);
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
|
||||
@ -2737,7 +2746,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float fRange = 1.0f / (FarZ - NearZ);
|
||||
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
|
||||
M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
|
||||
@ -2821,7 +2830,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float fRange = 1.0f / (NearZ - FarZ);
|
||||
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
|
||||
M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
|
||||
@ -2910,7 +2919,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH
|
||||
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
|
||||
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
|
||||
float fRange = 1.0f / (FarZ - NearZ);
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
|
||||
M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
|
||||
@ -3010,7 +3019,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH
|
||||
float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
|
||||
float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
|
||||
float fRange = 1.0f / (NearZ - FarZ);
|
||||
const XMVECTOR Zero = vdupq_n_f32(0);
|
||||
const float32x4_t Zero = vdupq_n_f32(0);
|
||||
XMMATRIX M;
|
||||
M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
|
||||
M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
|
||||
@ -3178,7 +3187,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
|
||||
R0 = vmul_f32(S0, R0);
|
||||
S0 = vrecps_f32(R0, vS);
|
||||
R0 = vmul_f32(S0, R0);
|
||||
float32x4_t Reciprocal = vcombine_u32(R0, R0);
|
||||
float32x4_t Reciprocal = vcombine_f32(R0, R0);
|
||||
r[0] = vmulq_f32(r[0], Reciprocal);
|
||||
r[1] = vmulq_f32(r[1], Reciprocal);
|
||||
r[2] = vmulq_f32(r[2], Reciprocal);
|
||||
@ -3266,7 +3275,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept
|
||||
R0 = vmul_f32(S0, R0);
|
||||
S0 = vrecps_f32(R0, vS);
|
||||
R0 = vmul_f32(S0, R0);
|
||||
float32x4_t Reciprocal = vcombine_u32(R0, R0);
|
||||
float32x4_t Reciprocal = vcombine_f32(R0, R0);
|
||||
XMMATRIX R;
|
||||
R.r[0] = vmulq_f32(r[0], Reciprocal);
|
||||
R.r[1] = vmulq_f32(r[1], Reciprocal);
|
||||
|
@ -120,12 +120,12 @@ inline XMVECTOR XM_CALLCONV XMQuaternionMultiply
|
||||
vResult = vmlaq_f32(vResult, Q2X, ControlWZYX);
|
||||
|
||||
// Mul by Q1ZWXY
|
||||
vTemp = vrev64q_u32(vTemp);
|
||||
vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
|
||||
Q2Y = vmulq_f32(Q2Y, vTemp);
|
||||
vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);
|
||||
|
||||
// Mul by Q1YXWZ
|
||||
vTemp = vrev64q_u32(vTemp);
|
||||
vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
|
||||
vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
|
||||
Q2Z = vmulq_f32(Q2Z, vTemp);
|
||||
vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
|
||||
@ -728,74 +728,74 @@ inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
|
||||
static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
|
||||
static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
|
||||
|
||||
XMVECTOR r0 = M.r[0];
|
||||
XMVECTOR r1 = M.r[1];
|
||||
XMVECTOR r2 = M.r[2];
|
||||
float32x4_t r0 = M.r[0];
|
||||
float32x4_t r1 = M.r[1];
|
||||
float32x4_t r2 = M.r[2];
|
||||
|
||||
XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
|
||||
XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
|
||||
XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
|
||||
float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
|
||||
float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
|
||||
float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
|
||||
|
||||
// x^2 >= y^2 equivalent to r11 - r00 <= 0
|
||||
XMVECTOR r11mr00 = vsubq_f32(r11, r00);
|
||||
XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero);
|
||||
float32x4_t r11mr00 = vsubq_f32(r11, r00);
|
||||
uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero);
|
||||
|
||||
// z^2 >= w^2 equivalent to r11 + r00 <= 0
|
||||
XMVECTOR r11pr00 = vaddq_f32(r11, r00);
|
||||
XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero);
|
||||
float32x4_t r11pr00 = vaddq_f32(r11, r00);
|
||||
uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero);
|
||||
|
||||
// x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
|
||||
XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
|
||||
uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
|
||||
|
||||
// (4*x^2, 4*y^2, 4*z^2, 4*w^2)
|
||||
XMVECTOR t0 = vmulq_f32(XMPMMP, r00);
|
||||
XMVECTOR x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
|
||||
float32x4_t t0 = vmulq_f32(XMPMMP, r00);
|
||||
float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
|
||||
x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22);
|
||||
x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne);
|
||||
|
||||
// (r01, r02, r12, r11)
|
||||
t0 = vextq_f32(r0, r0, 1);
|
||||
XMVECTOR t1 = vextq_f32(r1, r1, 1);
|
||||
float32x4_t t1 = vextq_f32(r1, r1, 1);
|
||||
t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1)));
|
||||
|
||||
// (r10, r20, r21, r10)
|
||||
t1 = vextq_f32(r2, r2, 3);
|
||||
XMVECTOR r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
|
||||
float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
|
||||
t1 = vbslq_f32(Select0110, t1, r10);
|
||||
|
||||
// (4*x*y, 4*x*z, 4*y*z, unused)
|
||||
XMVECTOR xyxzyz = vaddq_f32(t0, t1);
|
||||
float32x4_t xyxzyz = vaddq_f32(t0, t1);
|
||||
|
||||
// (r21, r20, r10, r10)
|
||||
t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10));
|
||||
|
||||
// (r12, r02, r01, r12)
|
||||
XMVECTOR t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
|
||||
XMVECTOR t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
|
||||
float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
|
||||
float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
|
||||
t1 = vbslq_f32(Select0110, t2, t3);
|
||||
|
||||
// (4*x*w, 4*y*w, 4*z*w, unused)
|
||||
XMVECTOR xwywzw = vsubq_f32(t0, t1);
|
||||
float32x4_t xwywzw = vsubq_f32(t0, t1);
|
||||
xwywzw = vmulq_f32(XMMPMP, xwywzw);
|
||||
|
||||
// (4*x*x, 4*x*y, 4*x*z, 4*x*w)
|
||||
t0 = vextq_f32(xyxzyz, xyxzyz, 3);
|
||||
t1 = vbslq_f32(Select0110, t0, x2y2z2w2);
|
||||
t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0);
|
||||
XMVECTOR tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
|
||||
float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
|
||||
|
||||
// (4*y*x, 4*y*y, 4*y*z, 4*y*w)
|
||||
t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2);
|
||||
t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1);
|
||||
XMVECTOR tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
|
||||
float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
|
||||
|
||||
// (4*z*x, 4*z*y, 4*z*z, 4*z*w)
|
||||
t0 = vextq_f32(xyxzyz, xyxzyz, 1);
|
||||
t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw)));
|
||||
XMVECTOR tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
|
||||
float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
|
||||
|
||||
// (4*w*x, 4*w*y, 4*w*z, 4*w*w)
|
||||
XMVECTOR tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
|
||||
float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
|
||||
|
||||
// Select the row of the tensor-product matrix that has the largest
|
||||
// magnitude.
|
||||
@ -1358,8 +1358,8 @@ inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept
|
||||
} } };
|
||||
return vResult.v;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
XMVECTOR vTemp = veorq_u32(vColor, g_XMNegate3);
|
||||
return vaddq_f32(vTemp, g_XMOne3);
|
||||
uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3);
|
||||
return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Negate only x,y and z.
|
||||
XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -23,7 +23,7 @@ inline float XMConvertHalfToFloat(HALF Value) noexcept
|
||||
__m128i V1 = _mm_cvtsi32_si128(static_cast<int>(Value));
|
||||
__m128 V2 = _mm_cvtph_ps(V1);
|
||||
return _mm_cvtss_f32(V2);
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
|
||||
uint16x4_t vHalf = vdup_n_u16(Value);
|
||||
float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
|
||||
return vgetq_lane_f32(vFloat, 0);
|
||||
@ -255,7 +255,7 @@ inline float* XMConvertHalfToFloatStream
|
||||
XM_SFENCE();
|
||||
|
||||
return pOutputStream;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
|
||||
auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
@ -389,7 +389,7 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
|
||||
__m128 V1 = _mm_set_ss(Value);
|
||||
__m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
|
||||
return static_cast<HALF>(_mm_extract_epi16(V2, 0));
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
|
||||
float32x4_t vFloat = vdupq_n_f32(Value);
|
||||
float16x4_t vHalf = vcvt_f16_f32(vFloat);
|
||||
return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
|
||||
@ -609,7 +609,7 @@ inline HALF* XMConvertFloatToHalfStream
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
|
||||
auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
@ -1091,9 +1091,9 @@ inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept
|
||||
return vResult.v;
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
|
||||
uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
|
||||
uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8));
|
||||
uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
|
||||
vInt = vandq_s32(vInt, g_XMMaskXY);
|
||||
vInt = vandq_u32(vInt, g_XMMaskXY);
|
||||
return vcvtq_f32_u32(vInt);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } };
|
||||
@ -1317,9 +1317,9 @@ inline XMVECTOR XM_CALLCONV XMLoadShortN4(const XMSHORTN4* pSource) noexcept
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
int16x4_t vInt = vld1_s16(reinterpret_cast<const int16_t*>(pSource));
|
||||
int32x4_t V = vmovl_s16(vInt);
|
||||
V = vcvtq_f32_s32(V);
|
||||
V = vmulq_n_f32(V, 1.0f / 32767.0f);
|
||||
return vmaxq_f32(V, vdupq_n_f32(-1.f));
|
||||
float32x4_t vResult = vcvtq_f32_s32(V);
|
||||
vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f);
|
||||
return vmaxq_f32(vResult, vdupq_n_f32(-1.f));
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Splat the color in all four entries (x,z,y,w)
|
||||
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
|
||||
@ -1391,8 +1391,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
uint16x4_t vInt = vld1_u16(reinterpret_cast<const uint16_t*>(pSource));
|
||||
uint32x4_t V = vmovl_u16(vInt);
|
||||
V = vcvtq_f32_u32(V);
|
||||
return vmulq_n_f32(V, 1.0f / 65535.0f);
|
||||
float32x4_t vResult = vcvtq_f32_u32(V);
|
||||
return vmulq_n_f32(vResult, 1.0f / 65535.0f);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 1.0f / (65535.0f * 65536.0f) } } };
|
||||
static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } };
|
||||
@ -1626,7 +1626,7 @@ inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept
|
||||
uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
|
||||
vInt = vandq_u32(vInt, g_XMMaskDec4);
|
||||
int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias);
|
||||
vTemp = veorq_u32(vTemp, g_XMFlipW);
|
||||
vTemp = veorq_s32(vTemp, g_XMFlipW);
|
||||
float32x4_t R = vcvtq_f32_s32(vTemp);
|
||||
R = vaddq_f32(R, g_XMAddUDec4);
|
||||
return vmulq_f32(R, XRMul);
|
||||
@ -2686,8 +2686,7 @@ inline void XM_CALLCONV XMStoreShortN4
|
||||
float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
|
||||
vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
|
||||
vResult = vmulq_n_f32(vResult, 32767.0f);
|
||||
vResult = vcvtq_s32_f32(vResult);
|
||||
int16x4_t vInt = vmovn_s32(vResult);
|
||||
int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
|
||||
vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
|
||||
@ -2724,8 +2723,7 @@ inline void XM_CALLCONV XMStoreShort4
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float32x4_t vResult = vmaxq_f32(V, g_ShortMin);
|
||||
vResult = vminq_f32(vResult, g_ShortMax);
|
||||
vResult = vcvtq_s32_f32(vResult);
|
||||
int16x4_t vInt = vmovn_s32(vResult);
|
||||
int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
|
||||
vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Bounds check
|
||||
@ -2767,8 +2765,7 @@ inline void XM_CALLCONV XMStoreUShortN4
|
||||
vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
|
||||
vResult = vmulq_n_f32(vResult, 65535.0f);
|
||||
vResult = vaddq_f32(vResult, g_XMOneHalf);
|
||||
vResult = vcvtq_u32_f32(vResult);
|
||||
uint16x4_t vInt = vmovn_u32(vResult);
|
||||
uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
|
||||
vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Bounds check
|
||||
@ -2812,8 +2809,7 @@ inline void XM_CALLCONV XMStoreUShort4
|
||||
#elif defined(_XM_ARM_NEON_INTRINSICS_)
|
||||
float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
|
||||
vResult = vminq_f32(vResult, g_UShortMax);
|
||||
vResult = vcvtq_u32_f32(vResult);
|
||||
uint16x4_t vInt = vmovn_u32(vResult);
|
||||
uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
|
||||
vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
// Bounds check
|
||||
@ -2947,7 +2943,7 @@ inline void XM_CALLCONV XMStoreXDec4
|
||||
vTemp = vorr_u32(vTemp, vTemp2);
|
||||
// Perform a single bit left shift on y|w
|
||||
vTemp2 = vdup_lane_u32(vTemp, 1);
|
||||
vTemp2 = vadd_s32(vTemp2, vTemp2);
|
||||
vTemp2 = vadd_u32(vTemp2, vTemp2);
|
||||
vTemp = vorr_u32(vTemp, vTemp2);
|
||||
vst1_lane_u32(&pDestination->v, vTemp, 0);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
@ -3640,7 +3636,7 @@ inline void XM_CALLCONV XMStoreU555
|
||||
vTemp = vorr_u32(vTemp, vTemp2);
|
||||
// Perform a single bit left shift on y|w
|
||||
vTemp2 = vdup_lane_u32(vTemp, 1);
|
||||
vTemp2 = vadd_s32(vTemp2, vTemp2);
|
||||
vTemp2 = vadd_u32(vTemp2, vTemp2);
|
||||
vTemp = vorr_u32(vTemp, vTemp2);
|
||||
vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
|
||||
#elif defined(_XM_SSE_INTRINSICS_)
|
||||
|
Loading…
Reference in New Issue
Block a user