From 9a1e0b53187e99096c473b9b8b11e9f07313e357 Mon Sep 17 00:00:00 2001 From: Chuck Walbourn Date: Mon, 23 May 2016 12:45:32 -0700 Subject: [PATCH] XNAMath 2.02 --- Inc/xnamath.h | 49 ++++++----- Inc/xnamathconvert.inl | 185 ++++++++++++++++++++++++----------------- Inc/xnamathmatrix.inl | 24 +++--- Inc/xnamathmisc.inl | 143 +++++-------------------------- Inc/xnamathvector.inl | 83 +++++++++--------- 5 files changed, 208 insertions(+), 276 deletions(-) diff --git a/Inc/xnamath.h b/Inc/xnamath.h index eabe19f..df0a571 100644 --- a/Inc/xnamath.h +++ b/Inc/xnamath.h @@ -22,7 +22,7 @@ Abstract: #error XNAMATH and XBOXMATH are incompatible in the same compilation module. Use one or the other. #endif -#define XNAMATH_VERSION 201 +#define XNAMATH_VERSION 202 #if !defined(_XM_X64_) && !defined(_XM_X86_) #if defined(_M_AMD64) || defined(_AMD64_) @@ -32,6 +32,16 @@ Abstract: #endif #endif +#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_) +#if defined(_XM_X64_) || defined(_XM_X86_) +#define _XM_LITTLEENDIAN_ +#elif defined(_XBOX_VER) +#define _XM_BIGENDIAN_ +#else +#error xnamath.h only supports x86, x64, or XBox 360 targets +#endif +#endif + #if defined(_XM_X86_) || defined(_XM_X64_) #define _XM_SSE_INTRINSICS_ #if !defined(__cplusplus) && !defined(_XM_NO_INTRINSICS_) @@ -174,7 +184,7 @@ XMFINLINE FLOAT XMConvertToDegrees(FLOAT fRadians) { return fRadians * (180.0f / ****************************************************************************/ #pragma warning(push) -#pragma warning(disable:4201) +#pragma warning(disable:4201 4365) #if !defined (_XM_X86_) && !defined(_XM_X64_) #pragma bitfield_order(push) @@ -278,9 +288,9 @@ typedef _DECLSPEC_ALIGN_16_ struct XMVECTORU32 { } XMVECTORU32; // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86 and Xbox 360, but not for other targets -#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINISCS_) +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) typedef const XMVECTOR FXMVECTOR; -#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINISCS_) +#elif defined(_XM_X86_) && !defined(_XM_NO_INTRINSICS_) typedef const XMVECTOR FXMVECTOR; #elif defined(__cplusplus) typedef const XMVECTOR& FXMVECTOR; @@ -289,7 +299,7 @@ typedef const XMVECTOR FXMVECTOR; #endif // Fix-up for (4th+) XMVECTOR parameters to pass in-register for Xbox 360 and by reference otherwise -#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINISCS_) +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) typedef const XMVECTOR CXMVECTOR; #elif defined(__cplusplus) typedef const XMVECTOR& CXMVECTOR; @@ -354,11 +364,11 @@ typedef _DECLSPEC_ALIGN_16_ struct _XMMATRIX FLOAT operator() (UINT Row, UINT Column) CONST { return m[Row][Column]; } FLOAT& operator() (UINT Row, UINT Column) { return m[Row][Column]; } - _XMMATRIX& operator= (CONST _XMMATRIX&); + _XMMATRIX& operator= (CONST _XMMATRIX& M); #ifndef XM_NO_OPERATOR_OVERLOADS - _XMMATRIX& operator*= (CONST _XMMATRIX&); - _XMMATRIX operator* (CONST _XMMATRIX&) CONST; + _XMMATRIX& operator*= (CONST _XMMATRIX& M); + _XMMATRIX operator* (CONST _XMMATRIX& M) CONST; #endif // !XM_NO_OPERATOR_OVERLOADS #endif // __cplusplus @@ -1500,7 +1510,7 @@ typedef struct _XMCOLOR _XMCOLOR() {}; _XMCOLOR(UINT Color) : c(Color) {}; - _XMCOLOR(FLOAT _x, FLOAT _y, FLOAT _z, FLOAT _w); + _XMCOLOR(FLOAT _r, FLOAT _g, FLOAT _b, FLOAT _a); _XMCOLOR(CONST FLOAT *pArray); operator UINT () { return c; } @@ -2582,9 +2592,6 @@ XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f}; XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f}; XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f}; XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f}; - -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) - XMGLOBALCONST XMVECTORI32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; XMGLOBALCONST XMVECTORI32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000}; XMGLOBALCONST XMVECTORI32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}; @@ -2638,8 +2645,6 @@ XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f}; XMGLOBALCONST XMVECTORI32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}; XMGLOBALCONST XMVECTORI32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}; XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}; - -#ifdef _XM_NO_INTRINSICS_ XMGLOBALCONST XMVECTORI32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}; XMGLOBALCONST XMVECTORI32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}; XMGLOBALCONST XMVECTORI32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}; @@ -2650,9 +2655,6 @@ XMGLOBALCONST XMVECTORI32 g_XMSwizzleYZXW = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XMGLOBALCONST XMVECTORI32 g_XMSwizzleZXYW = {XM_PERMUTE_0Z, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; XMGLOBALCONST XMVECTORI32 g_XMPermute0X0Y1X1Y = {XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y}; XMGLOBALCONST XMVECTORI32 g_XMPermute0Z0W1Z1W = {XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_1W}; -#endif // !_XM_NO_INTRINSICS_ - -#ifdef _XM_SSE_INTRINSICS_ XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f}; XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f}; XMGLOBALCONST XMVECTORI32 g_XMFlipY = {0,0x80000000,0,0}; @@ -2685,9 +2687,6 @@ XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024. XMGLOBALCONST XMVECTORI32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000}; XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000}; XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0}; -#endif - -#endif // _XM_NO_INTRINSICS_ /**************************************************************************** * @@ -2696,7 +2695,7 @@ XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0 ****************************************************************************/ #pragma warning(push) -#pragma warning(disable:4214 4204 4616 6001) +#pragma warning(disable:4214 4204 4365 4616 6001) #if !defined(__cplusplus) && !defined(_XBOX) && defined(_XM_ISVS2005_) @@ -2861,10 +2860,10 @@ XMFINLINE XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, UINT VSLeftRotateE //------------------------------------------------------------------------------ -#include -#include -#include -#include +#include "xnamathconvert.inl" +#include "xnamathvector.inl" +#include "xnamathmatrix.inl" +#include "xnamathmisc.inl" #pragma warning(pop) diff --git a/Inc/xnamathconvert.inl b/Inc/xnamathconvert.inl index 7180e44..d76d78d 100644 --- a/Inc/xnamathconvert.inl +++ b/Inc/xnamathconvert.inl @@ -431,8 +431,8 @@ XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource) #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); XMASSERT(((UINT_PTR)pSource & 3) == 0); - __m128i V = _mm_set_epi32( 0, 0, 0, *pSource ); - return reinterpret_cast<__m128 *>(&V)[0]; + + return _mm_load_ss( (const float*)pSource ); #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) #endif // _XM_VMX128_INTRINSICS_ } @@ -480,9 +480,10 @@ XMFINLINE XMVECTOR XMLoadInt2 #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); - __m128i V = _mm_set_epi32( 0, 0, *(pSource+1), *pSource ); - return reinterpret_cast<__m128 *>(&V)[0]; + __m128 x = _mm_load_ss( (const float*)pSource ); + __m128 y = _mm_load_ss( (const float*)(pSource+1) ); + return _mm_unpacklo_ps( x, y ); #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) #endif // _XM_VMX128_INTRINSICS_ } @@ -509,6 +510,8 @@ XMFINLINE XMVECTOR XMLoadInt2A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + __m128i V = _mm_loadl_epi64( (const __m128i*)pSource ); return reinterpret_cast<__m128 *>(&V)[0]; @@ -526,20 +529,16 @@ XMFINLINE XMVECTOR XMLoadFloat2 #if defined(_XM_NO_INTRINSICS_) XMVECTOR V; XMASSERT(pSource); + ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0]; ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0]; - V.vector4_f32[2] = V.vector4_f32[3] = 0.0f; return V; #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); -#ifdef _XM_X86_ + __m128 x = _mm_load_ss( &pSource->x ); __m128 y = _mm_load_ss( &pSource->y ); return _mm_unpacklo_ps( x, y ); -#else // _XM_X64_ - // This reads 2 floats past the memory that should be ignored. - return _mm_loadu_ps( &pSource->x ); -#endif #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) #endif // _XM_VMX128_INTRINSICS_ } @@ -565,14 +564,10 @@ XMFINLINE XMVECTOR XMLoadFloat2A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); -#ifdef _XM_X86_ - __m128 x = _mm_load_ss( &pSource->x ); - __m128 y = _mm_load_ss( &pSource->y ); - return _mm_unpacklo_ps( x, y ); -#else // _XM_X64_ - // This reads 2 floats past the memory that should be ignored. - return _mm_load_ps( &pSource->x ); -#endif + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); + + __m128i V = _mm_loadl_epi64( (const __m128i*)pSource ); + return reinterpret_cast<__m128 *>(&V)[0]; #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -792,8 +787,17 @@ XMFINLINE XMVECTOR XMLoadInt3 #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); + +#ifdef _XM_ISVS2005_ __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource ); return reinterpret_cast<__m128 *>(&V)[0]; +#else + __m128 x = _mm_load_ss( (const float*)pSource ); + __m128 y = _mm_load_ss( (const float*)(pSource+1) ); + __m128 z = _mm_load_ss( (const float*)(pSource+2) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#endif // !_XM_ISVS2005_ #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) #endif // _XM_VMX128_INTRINSICS_ } @@ -839,15 +843,26 @@ XMFINLINE XMVECTOR XMLoadFloat3 #if defined(_XM_NO_INTRINSICS_) XMVECTOR V; XMASSERT(pSource); + ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0]; ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0]; ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0]; - V.vector4_f32[3] = 0.0f; return V; #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); + +#ifdef _XM_ISVS2005_ // This reads 1 floats past the memory that should be ignored. + // Need to continue to do this for VS 2005 due to compiler issue but prefer new method + // to avoid triggering issues with memory debug tools (like AV) return _mm_loadu_ps( &pSource->x ); +#else + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + __m128 z = _mm_load_ss( &pSource->z ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#endif // !_XM_ISVS2005_ #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) #endif // _XM_VMX128_INTRINSICS_ } @@ -874,10 +889,10 @@ XMFINLINE XMVECTOR XMLoadFloat3A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); + XMASSERT(((UINT_PTR)pSource & 0xF) == 0); - // This reads 1 floats past the memory that should be ignored. - - return _mm_load_ps( &pSource->x ); + // This reads 1 floats past the memory that should be ignored. + return _mm_load_ps( &pSource->x ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -1298,9 +1313,9 @@ XMFINLINE XMVECTOR XMLoadFloat3PK CONST XMFLOAT3PK* pSource ) { + _DECLSPEC_ALIGN_16_ UINT Result[4]; UINT Mantissa; UINT Exponent; - UINT Result[3]; XMASSERT(pSource); @@ -1406,7 +1421,7 @@ XMFINLINE XMVECTOR XMLoadFloat3PK Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); } - return XMLoadFloat3( (XMFLOAT3*)&Result ); + return XMLoadFloat3A( (XMFLOAT3A*)&Result ); } //------------------------------------------------------------------------------ @@ -1416,9 +1431,9 @@ XMFINLINE XMVECTOR XMLoadFloat3SE CONST XMFLOAT3SE* pSource ) { + _DECLSPEC_ALIGN_16_ UINT Result[4]; UINT Mantissa; UINT Exponent, ExpBits; - UINT Result[3]; XMASSERT(pSource); @@ -1515,7 +1530,7 @@ XMFINLINE XMVECTOR XMLoadFloat3SE Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14); } - return XMLoadFloat3( (XMFLOAT3*)&Result ); + return XMLoadFloat3A( (XMFLOAT3A*)&Result ); } //------------------------------------------------------------------------------ @@ -1541,6 +1556,7 @@ XMFINLINE XMVECTOR XMLoadInt4 #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); + __m128i V = _mm_loadu_si128( (const __m128i*)pSource ); return reinterpret_cast<__m128 *>(&V)[0]; @@ -1577,7 +1593,6 @@ XMFINLINE XMVECTOR XMLoadInt4A __m128i V = _mm_load_si128( (const __m128i*)pSource ); return reinterpret_cast<__m128 *>(&V)[0]; - #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -1592,6 +1607,7 @@ XMFINLINE XMVECTOR XMLoadFloat4 #if defined(_XM_NO_INTRINSICS_) XMVECTOR V; XMASSERT(pSource); + ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0]; ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0]; ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0]; @@ -1599,6 +1615,7 @@ XMFINLINE XMVECTOR XMLoadFloat4 return V; #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pSource); + return _mm_loadu_ps( &pSource->x ); #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) #endif // _XM_VMX128_INTRINSICS_ @@ -3055,13 +3072,18 @@ XMFINLINE VOID XMStoreInt FXMVECTOR V ) { -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_NO_INTRINSICS_) XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 3) == 0); *pDestination = XMVectorGetIntX( V ); +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + _mm_store_ss( (float*)pDestination, V ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -3074,13 +3096,18 @@ XMFINLINE VOID XMStoreFloat FXMVECTOR V ) { -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) +#if defined(_XM_NO_INTRINSICS_) XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 3) == 0); *pDestination = XMVectorGetX( V ); +#elif defined(_XM_SSE_INTRINSICS_) + XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + + _mm_store_ss( pDestination, V ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -3102,12 +3129,12 @@ XMFINLINE VOID XMStoreInt2 pDestination[1] = V.vector4_u32[1]; #elif defined(_XM_SSE_INTRINSICS_) - XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 3) == 0); - pDestination[0] = XMVectorGetIntX( V ); - pDestination[1] = XMVectorGetIntY( V ); + XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( (float*)&pDestination[0], V ); + _mm_store_ss( (float*)&pDestination[1], T ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -3131,7 +3158,7 @@ XMFINLINE VOID XMStoreInt2A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - XMASSERT(((UINT_PTR)pDestination & 3) == 0); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); @@ -3156,14 +3183,12 @@ XMFINLINE VOID XMStoreFloat2 pDestination->y = V.vector4_f32[1]; #elif defined(_XM_SSE_INTRINSICS_) - XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 3) == 0); - XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); - _mm_store_ss( &pDestination->x, V ); - _mm_store_ss( &pDestination->y, T ); - + XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } @@ -3187,11 +3212,9 @@ XMFINLINE VOID XMStoreFloat2A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - XMASSERT(((UINT_PTR)pDestination & 3) == 0); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); - XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); - _mm_store_ss( &pDestination->x, V ); - _mm_store_ss( &pDestination->y, T ); + _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ @@ -3391,9 +3414,12 @@ XMFINLINE VOID XMStoreInt3 XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 3) == 0); - pDestination[0] = XMVectorGetIntX( V ); - pDestination[1] = XMVectorGetIntY( V ); - pDestination[2] = XMVectorGetIntZ( V ); + + XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( (float*)pDestination, V ); + _mm_store_ss( (float*)&pDestination[1], T1 ); + _mm_store_ss( (float*)&pDestination[2], T2 ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ @@ -3419,10 +3445,11 @@ XMFINLINE VOID XMStoreInt3A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - XMASSERT(((UINT_PTR)pDestination & 3) == 0); - pDestination[0] = XMVectorGetIntX( V ); - pDestination[1] = XMVectorGetIntY( V ); - pDestination[2] = XMVectorGetIntZ( V ); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); + + XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); + _mm_store_ss( (float*)&pDestination[2], T ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ @@ -3450,11 +3477,11 @@ XMFINLINE VOID XMStoreFloat3 XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 3) == 0); - XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); - _mm_store_ss( &pDestination->x, V ); - _mm_store_ss( &pDestination->y, T1 ); - _mm_store_ss( &pDestination->z, T2 ); + XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ @@ -3482,11 +3509,9 @@ XMFINLINE VOID XMStoreFloat3A XMASSERT(pDestination); XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); - XMVECTOR T1 = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); - XMVECTOR T2 = _mm_unpackhi_ps( V, V ); - _mm_store_ss( &pDestination->x, V ); - _mm_store_ss( &pDestination->y, T1 ); - _mm_store_ss( &pDestination->z, T2 ); + XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); + _mm_store_ss( &pDestination->z, T ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ @@ -3918,13 +3943,13 @@ XMFINLINE VOID XMStoreFloat3PK FXMVECTOR V ) { + _DECLSPEC_ALIGN_16_ UINT IValue[4]; UINT I, Sign, j; - UINT IValue[3]; UINT Result[3]; XMASSERT(pDestination); - XMStoreFloat3( (XMFLOAT3*)&IValue, V ); + XMStoreFloat3A( (XMFLOAT3A*)&IValue, V ); // X & Y Channels (5-bit exponent, 6-bit mantissa) for(j=0; j < 2; ++j) @@ -4036,14 +4061,15 @@ XMFINLINE VOID XMStoreFloat3SE FXMVECTOR V ) { + _DECLSPEC_ALIGN_16_ UINT IValue[4]; UINT I, Sign, j, T; - UINT IValue[3]; UINT Frac[3]; UINT Exp[3]; + XMASSERT(pDestination); - XMStoreFloat3( (XMFLOAT3*)&IValue, V ); + XMStoreFloat3A( (XMFLOAT3A*)&IValue, V ); // X, Y, Z Channels (5-bit exponent, 9-bit mantissa) for(j=0; j < 3; ++j) @@ -4131,7 +4157,7 @@ XMFINLINE VOID XMStoreInt4 #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - + _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); #else // _XM_VMX128_INTRINSICS_ @@ -4158,6 +4184,7 @@ XMFINLINE VOID XMStoreInt4A #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); + XMASSERT(((UINT_PTR)pDestination & 0xF) == 0); _mm_store_si128( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); @@ -4176,7 +4203,8 @@ XMFINLINE VOID XMStoreInt4NC #if defined(_XM_NO_INTRINSICS_) XMASSERT(pDestination); - + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + pDestination[0] = V.vector4_u32[0]; pDestination[1] = V.vector4_u32[1]; pDestination[2] = V.vector4_u32[2]; @@ -4184,7 +4212,8 @@ XMFINLINE VOID XMStoreInt4NC #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast(&V)[0] ); #else // _XM_VMX128_INTRINSICS_ @@ -4202,7 +4231,7 @@ XMFINLINE VOID XMStoreFloat4 #if defined(_XM_NO_INTRINSICS_) XMASSERT(pDestination); - + pDestination->x = V.vector4_f32[0]; pDestination->y = V.vector4_f32[1]; pDestination->z = V.vector4_f32[2]; @@ -4210,7 +4239,7 @@ XMFINLINE VOID XMStoreFloat4 #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - + _mm_storeu_ps( &pDestination->x, V ); #else // _XM_VMX128_INTRINSICS_ @@ -4255,7 +4284,8 @@ XMFINLINE VOID XMStoreFloat4NC #if defined(_XM_NO_INTRINSICS_) XMASSERT(pDestination); - + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + pDestination->x = V.vector4_f32[0]; pDestination->y = V.vector4_f32[1]; pDestination->z = V.vector4_f32[2]; @@ -4263,7 +4293,8 @@ XMFINLINE VOID XMStoreFloat4NC #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(pDestination); - + XMASSERT(((UINT_PTR)pDestination & 3) == 0); + _mm_storeu_ps( &pDestination->x, V ); #else // _XM_VMX128_INTRINSICS_ @@ -5452,13 +5483,13 @@ XMFINLINE VOID XMStoreColor // Convert to 0-255 vResult = _mm_mul_ps(vResult,Scale); // Shuffle RGBA to ARGB - vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,1,0,3)); + vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2)); // Convert to int __m128i vInt = _mm_cvtps_epi32(vResult); // Mash to shorts vInt = _mm_packs_epi32(vInt,vInt); // Mash to bytes - vInt = _mm_packs_epi16(vInt,vInt); + vInt = _mm_packus_epi16(vInt,vInt); // Store the color _mm_store_ss(reinterpret_cast(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]); #else // _XM_VMX128_INTRINSICS_ @@ -5698,12 +5729,12 @@ XMFINLINE VOID XMStoreFloat4x4A pDestination->m[3][3] = M.r[3].vector4_f32[3]; #elif defined(_XM_SSE_INTRINSICS_) - XMASSERT(pDestination); + XMASSERT(pDestination); - _mm_store_ps( &pDestination->_11, M.r[0] ); - _mm_store_ps( &pDestination->_21, M.r[1] ); - _mm_store_ps( &pDestination->_31, M.r[2] ); - _mm_store_ps( &pDestination->_41, M.r[3] ); + _mm_store_ps( &pDestination->_11, M.r[0] ); + _mm_store_ps( &pDestination->_21, M.r[1] ); + _mm_store_ps( &pDestination->_31, M.r[2] ); + _mm_store_ps( &pDestination->_41, M.r[3] ); #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } diff --git a/Inc/xnamathmatrix.inl b/Inc/xnamathmatrix.inl index f35a09f..293501a 100644 --- a/Inc/xnamathmatrix.inl +++ b/Inc/xnamathmatrix.inl @@ -2310,7 +2310,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveRH XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH ( FLOAT FovAngleY, - FLOAT AspectHByW, + FLOAT AspectRatio, FLOAT NearZ, FLOAT FarZ ) @@ -2324,13 +2324,13 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH XMMATRIX M; XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); Height = CosFov / SinFov; - Width = Height / AspectHByW; + Width = Height / AspectRatio; M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f); M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f); @@ -2341,7 +2341,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); XMMATRIX M; FLOAT SinFov; @@ -2351,7 +2351,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH // Note: This is recorded on the stack FLOAT Height = CosFov / SinFov; XMVECTOR rMem = { - Height / AspectHByW, + Height / AspectRatio, Height, fRange, -fRange * NearZ @@ -2363,7 +2363,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH vTemp = _mm_move_ss(vTemp,vValues); // CosFov / SinFov,0,0,0 M.r[0] = vTemp; - // 0,Height / AspectHByW,0,0 + // 0,Height / AspectRatio,0,0 vTemp = vValues; vTemp = _mm_and_ps(vTemp,g_XMMaskY); M.r[1] = vTemp; @@ -2386,7 +2386,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovLH XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH ( FLOAT FovAngleY, - FLOAT AspectHByW, + FLOAT AspectRatio, FLOAT NearZ, FLOAT FarZ ) @@ -2400,13 +2400,13 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH XMMATRIX M; XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); Height = CosFov / SinFov; - Width = Height / AspectHByW; + Width = Height / AspectRatio; M.r[0] = XMVectorSet(Width, 0.0f, 0.0f, 0.0f); M.r[1] = XMVectorSet(0.0f, Height, 0.0f, 0.0f); @@ -2417,7 +2417,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH #elif defined(_XM_SSE_INTRINSICS_) XMASSERT(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - XMASSERT(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + XMASSERT(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); XMASSERT(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); XMMATRIX M; FLOAT SinFov; @@ -2427,7 +2427,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH // Note: This is recorded on the stack FLOAT Height = CosFov / SinFov; XMVECTOR rMem = { - Height / AspectHByW, + Height / AspectRatio, Height, fRange, fRange * NearZ @@ -2439,7 +2439,7 @@ XMFINLINE XMMATRIX XMMatrixPerspectiveFovRH vTemp = _mm_move_ss(vTemp,vValues); // CosFov / SinFov,0,0,0 M.r[0] = vTemp; - // 0,Height / AspectHByW,0,0 + // 0,Height / AspectRatio,0,0 vTemp = vValues; vTemp = _mm_and_ps(vTemp,g_XMMaskY); M.r[1] = vTemp; diff --git a/Inc/xnamathmisc.inl b/Inc/xnamathmisc.inl index 593c45b..c606d1d 100644 --- a/Inc/xnamathmisc.inl +++ b/Inc/xnamathmisc.inl @@ -693,7 +693,7 @@ XMFINLINE XMVECTOR XMQuaternionBaryCentric s = f + g; - if (s < 0.00001f && s > -0.00001f) + if ((s < 0.00001f) && (s > -0.00001f)) { Result = Q0; } @@ -932,28 +932,26 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix CXMMATRIX M ) { -#if defined(_XM_NO_INTRINSICS_) +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) XMVECTOR Q0, Q1, Q2; XMVECTOR M00, M11, M22; XMVECTOR CQ0, CQ1, C; XMVECTOR CX, CY, CZ, CW; XMVECTOR SQ1, Scale; - XMVECTOR Rsq, Sqrt, VEqualsInfinity, VEqualsZero, Select; + XMVECTOR Rsq, Sqrt, VEqualsNaN; XMVECTOR A, B, P; XMVECTOR PermuteSplat, PermuteSplatT; XMVECTOR SignB, SignBT; XMVECTOR PermuteControl, PermuteControlT; - XMVECTOR Zero; XMVECTOR Result; - static CONST XMVECTOR OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f}; - static CONST XMVECTOR SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f}; - static CONST XMVECTOR SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f}; - static CONST XMVECTOR SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f}; - static CONST XMVECTOR SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f}; - static CONST XMVECTOR SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f}; - static CONST XMVECTOR SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f}; - static CONST XMVECTOR SignNNNX = {-1.0f, -1.0f, -1.0f, 2.0e-126f}; + static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f}; + static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f}; + static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f}; + static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f}; + static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f}; + static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f}; + static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f}; static CONST XMVECTORU32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; static CONST XMVECTORU32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W}; static CONST XMVECTORU32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; @@ -972,26 +970,23 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix M11 = XMVectorSplatY(M.r[1]); M22 = XMVectorSplatZ(M.r[2]); - Q0 = XMVectorMultiply(SignPNNP, M00); - Q0 = XMVectorMultiplyAdd(SignNPNP, M11, Q0); - Q0 = XMVectorMultiplyAdd(SignNNPP, M22, Q0); + Q0 = XMVectorMultiply(SignPNNP.v, M00); + Q0 = XMVectorMultiplyAdd(SignNPNP.v, M11, Q0); + Q0 = XMVectorMultiplyAdd(SignNNPP.v, M22, Q0); Q1 = XMVectorAdd(Q0, g_XMOne.v); Rsq = XMVectorReciprocalSqrt(Q1); - Zero = XMVectorZero(); - VEqualsInfinity = XMVectorEqualInt(Q1, g_XMInfinity.v); - VEqualsZero = XMVectorEqual(Q1, Zero); + VEqualsNaN = XMVectorIsNaN(Rsq); Sqrt = XMVectorMultiply(Q1, Rsq); - Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); - Q1 = XMVectorSelect(Q1, Sqrt, Select); + Q1 = XMVectorSelect(Sqrt, Q1, VEqualsNaN); Q1 = XMVectorMultiply(Q1, g_XMOneHalf.v); SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf.v); CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W.v); - CQ1 = XMVectorPermute(Q0, SignNNNX, Permute0Y0Z0Z1W.v); + CQ1 = XMVectorPermute(Q0, g_XMEpsilon.v, Permute0Y0Z0Z1W.v); C = XMVectorGreaterOrEqual(CQ0, CQ1); CX = XMVectorSplatX(C); @@ -1000,15 +995,15 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix CW = XMVectorSplatW(C); PermuteSplat = XMVectorSelect(SplatZ.v, SplatY.v, CZ); - SignB = XMVectorSelect(SignNPPP, SignPPNP, CZ); + SignB = XMVectorSelect(SignNPPP.v, SignPPNP.v, CZ); PermuteControl = XMVectorSelect(Permute2.v, Permute1.v, CZ); PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ.v, CX); - SignB = XMVectorSelect(SignB, SignNPPP, CX); + SignB = XMVectorSelect(SignB, SignNPPP.v, CX); PermuteControl = XMVectorSelect(PermuteControl, Permute2.v, CX); PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX.v, CY); - SignBT = XMVectorSelect(SignB, SignPNPP, CY); + SignBT = XMVectorSelect(SignB, SignPNPP.v, CY); PermuteControlT = XMVectorSelect(PermuteControl,Permute0.v, CY); PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX); @@ -1016,7 +1011,7 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX); PermuteSplat = XMVectorSelect(PermuteSplat,SplatW.v, CW); - SignB = XMVectorSelect(SignB, SignNNNX, CW); + SignB = XMVectorSelect(SignB, g_XMNegativeOne.v, CW); PermuteControl = XMVectorSelect(PermuteControl,Permute3.v, CW); Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat); @@ -1032,104 +1027,6 @@ XMINLINE XMVECTOR XMQuaternionRotationMatrix return Result; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR Q0, Q1, Q2; - XMVECTOR M00, M11, M22; - XMVECTOR CQ0, CQ1, C; - XMVECTOR CX, CY, CZ, CW; - XMVECTOR SQ1, Scale; - XMVECTOR Rsq, Sqrt, VEqualsInfinity, VEqualsZero, Select; - XMVECTOR A, B, P; - XMVECTOR PermuteSplat, PermuteSplatT; - XMVECTOR SignB, SignBT; - XMVECTOR PermuteControl, PermuteControlT; - XMVECTOR Zero; - XMVECTOR Result; - static CONST XMVECTORF32 OneQuarter = {0.25f, 0.25f, 0.25f, 0.25f}; - static CONST XMVECTORF32 SignPNNP = {1.0f, -1.0f, -1.0f, 1.0f}; - static CONST XMVECTORF32 SignNPNP = {-1.0f, 1.0f, -1.0f, 1.0f}; - static CONST XMVECTORF32 SignNNPP = {-1.0f, -1.0f, 1.0f, 1.0f}; - static CONST XMVECTORF32 SignPNPP = {1.0f, -1.0f, 1.0f, 1.0f}; - static CONST XMVECTORF32 SignPPNP = {1.0f, 1.0f, -1.0f, 1.0f}; - static CONST XMVECTORF32 SignNPPP = {-1.0f, 1.0f, 1.0f, 1.0f}; - static CONST XMVECTORF32 SignNNNX = {-1.0f, -1.0f, -1.0f, 2.0e-126f}; - static CONST XMVECTORI32 Permute0X0X0Y0W = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_0W}; - static CONST XMVECTORI32 Permute0Y0Z0Z1W = {XM_PERMUTE_0Y, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_1W}; - static CONST XMVECTORI32 SplatX = {XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X}; - static CONST XMVECTORI32 SplatY = {XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y}; - static CONST XMVECTORI32 SplatZ = {XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Z}; - static CONST XMVECTORI32 SplatW = {XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W, XM_PERMUTE_0W}; - static CONST XMVECTORI32 PermuteC = {XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Y}; - static CONST XMVECTORI32 PermuteA = {XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0W}; - static CONST XMVECTORI32 PermuteB = {XM_PERMUTE_1X, XM_PERMUTE_1W, XM_PERMUTE_0Z, XM_PERMUTE_0W}; - static CONST XMVECTORI32 Permute0 = {XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Z, XM_PERMUTE_1Y}; - static CONST XMVECTORI32 Permute1 = {XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Z}; - static CONST XMVECTORI32 Permute2 = {XM_PERMUTE_1Z, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1X}; - static CONST XMVECTORI32 Permute3 = {XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_1X, XM_PERMUTE_0W}; - - M00 = XMVectorSplatX(M.r[0]); - M11 = XMVectorSplatY(M.r[1]); - M22 = XMVectorSplatZ(M.r[2]); - - Q0 = XMVectorMultiply(SignPNNP, M00); - Q0 = XMVectorMultiplyAdd(SignNPNP, M11, Q0); - Q0 = XMVectorMultiplyAdd(SignNNPP, M22, Q0); - - Q1 = XMVectorAdd(Q0, g_XMOne); - - Rsq = XMVectorReciprocalSqrt(Q1); - Zero = XMVectorZero(); - VEqualsInfinity = XMVectorEqualInt(Q1, g_XMInfinity); - VEqualsZero = XMVectorEqual(Q1, Zero); - Sqrt = XMVectorMultiply(Q1, Rsq); - Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); - Q1 = XMVectorSelect(Q1, Sqrt, Select); - - Q1 = XMVectorMultiply(Q1, g_XMOneHalf); - - SQ1 = XMVectorMultiply(Rsq, g_XMOneHalf); - - CQ0 = XMVectorPermute(Q0, Q0, Permute0X0X0Y0W); - CQ1 = XMVectorPermute(Q0, SignNNNX, Permute0Y0Z0Z1W); - C = XMVectorGreaterOrEqual(CQ0, CQ1); - - CX = XMVectorSplatX(C); - CY = XMVectorSplatY(C); - CZ = XMVectorSplatZ(C); - CW = XMVectorSplatW(C); - - PermuteSplat = XMVectorSelect(SplatZ, SplatY, CZ); - SignB = XMVectorSelect(SignNPPP, SignPPNP, CZ); - PermuteControl = XMVectorSelect(Permute2, Permute1, CZ); - - PermuteSplat = XMVectorSelect(PermuteSplat, SplatZ, CX); - SignB = XMVectorSelect(SignB, SignNPPP, CX); - PermuteControl = XMVectorSelect(PermuteControl, Permute2, CX); - - PermuteSplatT = XMVectorSelect(PermuteSplat,SplatX, CY); - SignBT = XMVectorSelect(SignB, SignPNPP, CY); - PermuteControlT = XMVectorSelect(PermuteControl,Permute0, CY); - - PermuteSplat = XMVectorSelect(PermuteSplat, PermuteSplatT, CX); - SignB = XMVectorSelect(SignB, SignBT, CX); - PermuteControl = XMVectorSelect(PermuteControl, PermuteControlT, CX); - - PermuteSplat = XMVectorSelect(PermuteSplat,SplatW, CW); - SignB = XMVectorSelect(SignB, SignNNNX, CW); - PermuteControl = XMVectorSelect(PermuteControl,Permute3, CW); - - Scale = XMVectorPermute(SQ1, SQ1, PermuteSplat); - - P = XMVectorPermute(M.r[1], M.r[2],PermuteC); // {M10, M12, M20, M21} - A = XMVectorPermute(M.r[0], P, PermuteA); // {M01, M12, M20, M03} - B = XMVectorPermute(M.r[0], P, PermuteB); // {M10, M21, M02, M03} - - Q2 = XMVectorMultiplyAdd(SignB, B, A); - Q2 = XMVectorMultiply(Q2, Scale); - - Result = XMVectorPermute(Q1, Q2, PermuteControl); - - return Result; #else // _XM_VMX128_INTRINSICS_ #endif // _XM_VMX128_INTRINSICS_ } diff --git a/Inc/xnamathvector.inl b/Inc/xnamathvector.inl index cb07840..6aa536c 100644 --- a/Inc/xnamathvector.inl +++ b/Inc/xnamathvector.inl @@ -1256,7 +1256,7 @@ XMFINLINE XMVECTOR XMVectorPermute ++pControl; VectorIndex = (uIndex>>4)&1; uIndex &= 0x0F; -#if defined(_XM_X86_) || defined(_XM_X64_) +#if defined(_XM_LITTLEENDIAN_) uIndex ^= 3; // Swap byte ordering on little endian machines #endif pWork[0] = aByte[VectorIndex][uIndex]; @@ -4415,22 +4415,22 @@ XMINLINE XMVECTOR XMVectorATan2 // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: // Y == 0 and X is Negative -> Pi with the sign of Y - // Y == 0 and X is Positive -> 0 with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y // Y != 0 and X == 0 -> Pi / 2 with the sign of Y - // X == -Infinity and Finite Y > 0 -> Pi with the sign of Y - // X == +Infinity and Finite Y > 0 -> 0 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y - // TODO: Return Y / X if the result underflows XMVECTOR Reciprocal; XMVECTOR V; XMVECTOR YSign; XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour; - XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity, FiniteYGreaterZero; + XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity; XMVECTOR ATanResultValid; - XMVECTOR R0, R1, R2, R3, R4, R5, R6, R7; + XMVECTOR R0, R1, R2, R3, R4, R5; XMVECTOR Zero; XMVECTOR Result; static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; @@ -4449,8 +4449,6 @@ XMINLINE XMVECTOR XMVectorATan2 XIsPositive = XMVectorEqualInt(XIsPositive, Zero); YEqualsInfinity = XMVectorIsInfinite(Y); XEqualsInfinity = XMVectorIsInfinite(X); - FiniteYGreaterZero = XMVectorGreater(Y, Zero); - FiniteYGreaterZero = XMVectorSelect(FiniteYGreaterZero, Zero, YEqualsInfinity); YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); Pi = XMVectorOrInt(Pi, YSign); @@ -4463,25 +4461,25 @@ XMINLINE XMVECTOR XMVectorATan2 R3 = XMVectorSelect(R2, R1, YEqualsZero); R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); - R6 = XMVectorSelect(R3, R5, YEqualsInfinity); - R7 = XMVectorSelect(R6, R1, FiniteYGreaterZero); - Result = XMVectorSelect(R6, R7, XEqualsInfinity); + Result = XMVectorSelect(R3, R5, YEqualsInfinity); ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); Reciprocal = XMVectorReciprocal(X); V = XMVectorMultiply(Y, Reciprocal); R0 = XMVectorATan(V); - Result = XMVectorSelect(Result, R0, ATanResultValid); + R1 = XMVectorSelect( Pi, Zero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); return Result; #elif defined(_XM_SSE_INTRINSICS_) static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + // Mask if Y>0 && Y!=INF - XMVECTOR FiniteYGreaterZero = _mm_cmpgt_ps(Y,g_XMZero); XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); - FiniteYGreaterZero = _mm_andnot_ps(YEqualsInfinity,FiniteYGreaterZero); // Get the sign of (Y&0x80000000) XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero); // Get the sign bits of X @@ -4489,10 +4487,10 @@ XMINLINE XMVECTOR XMVectorATan2 // Change them to masks XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero); // Get Pi - XMVECTOR R1 = _mm_load_ps1(&ATan2Constants.f[0]); + XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]); // Copy the sign of Y - R1 = _mm_or_ps(R1,YSign); - R1 = XMVectorSelect(R1,YSign,XIsPositive); + Pi = _mm_or_ps(Pi,YSign); + XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive); // Mask for X==0 XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero); // Get Pi/2 with with sign of Y @@ -4513,7 +4511,7 @@ XMINLINE XMVECTOR XMVectorATan2 vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity); XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity); - vConstants = XMVectorSelect(vResult,R1,FiniteYGreaterZero); + vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity); // At this point, any entry that's zero will get the result // from XMVectorATan(), otherwise, return the failsafe value vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity); @@ -4523,6 +4521,10 @@ XMINLINE XMVECTOR XMVectorATan2 vConstants = _mm_div_ps(Y,X); vConstants = XMVectorATan(vConstants); // Discard entries that have been declared void + + XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive ); + vConstants = _mm_add_ps( vConstants, R3 ); + vResult = XMVectorSelect(vResult,vConstants,ATanResultValid); return vResult; #else // _XM_VMX128_INTRINSICS_ @@ -5139,9 +5141,9 @@ XMFINLINE XMVECTOR XMVectorATan2Est XMVECTOR V; XMVECTOR YSign; XMVECTOR Pi, PiOverTwo, PiOverFour, ThreePiOverFour; - XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity, FiniteYGreaterZero; + XMVECTOR YEqualsZero, XEqualsZero, XIsPositive, YEqualsInfinity, XEqualsInfinity; XMVECTOR ATanResultValid; - XMVECTOR R0, R1, R2, R3, R4, R5, R6, R7; + XMVECTOR R0, R1, R2, R3, R4, R5; XMVECTOR Zero; XMVECTOR Result; static CONST XMVECTOR ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; @@ -5160,8 +5162,6 @@ XMFINLINE XMVECTOR XMVectorATan2Est XIsPositive = XMVectorEqualInt(XIsPositive, Zero); YEqualsInfinity = XMVectorIsInfinite(Y); XEqualsInfinity = XMVectorIsInfinite(X); - FiniteYGreaterZero = XMVectorGreater(Y, Zero); - FiniteYGreaterZero = XMVectorSelect(FiniteYGreaterZero, Zero, YEqualsInfinity); YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); Pi = XMVectorOrInt(Pi, YSign); @@ -5174,25 +5174,25 @@ XMFINLINE XMVECTOR XMVectorATan2Est R3 = XMVectorSelect(R2, R1, YEqualsZero); R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); - R6 = XMVectorSelect(R3, R5, YEqualsInfinity); - R7 = XMVectorSelect(R6, R1, FiniteYGreaterZero); - Result = XMVectorSelect(R6, R7, XEqualsInfinity); + Result = XMVectorSelect(R3, R5, YEqualsInfinity); ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); Reciprocal = XMVectorReciprocalEst(X); V = XMVectorMultiply(Y, Reciprocal); R0 = XMVectorATanEst(V); - Result = XMVectorSelect(Result, R0, ATanResultValid); + R1 = XMVectorSelect( Pi, Zero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); return Result; #elif defined(_XM_SSE_INTRINSICS_) static CONST XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + // Mask if Y>0 && Y!=INF - XMVECTOR FiniteYGreaterZero = _mm_cmpgt_ps(Y,g_XMZero); XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); - FiniteYGreaterZero = _mm_andnot_ps(YEqualsInfinity,FiniteYGreaterZero); // Get the sign of (Y&0x80000000) XMVECTOR YSign = _mm_and_ps(Y, g_XMNegativeZero); // Get the sign bits of X @@ -5200,10 +5200,10 @@ XMFINLINE XMVECTOR XMVectorATan2Est // Change them to masks XIsPositive = XMVectorEqualInt(XIsPositive,g_XMZero); // Get Pi - XMVECTOR R1 = _mm_load_ps1(&ATan2Constants.f[0]); + XMVECTOR Pi = _mm_load_ps1(&ATan2Constants.f[0]); // Copy the sign of Y - R1 = _mm_or_ps(R1,YSign); - R1 = XMVectorSelect(R1,YSign,XIsPositive); + Pi = _mm_or_ps(Pi,YSign); + XMVECTOR R1 = XMVectorSelect(Pi,YSign,XIsPositive); // Mask for X==0 XMVECTOR vConstants = _mm_cmpeq_ps(X,g_XMZero); // Get Pi/2 with with sign of Y @@ -5224,16 +5224,21 @@ XMFINLINE XMVECTOR XMVectorATan2Est vConstants = XMVectorSelect(PiOverTwo,vConstants,XEqualsInfinity); XMVECTOR vResult = XMVectorSelect(R2,vConstants,YEqualsInfinity); - vConstants = XMVectorSelect(vResult,R1,FiniteYGreaterZero); + vConstants = XMVectorSelect(R1,vResult,YEqualsInfinity); // At this point, any entry that's zero will get the result // from XMVectorATan(), otherwise, return the failsafe value vResult = XMVectorSelect(vResult,vConstants,XEqualsInfinity); // Any entries not 0xFFFFFFFF, are considered precalculated XMVECTOR ATanResultValid = XMVectorEqualInt(vResult,g_XMNegOneMask); // Let's do the ATan2 function - vConstants = _mm_div_ps(Y,X); + XMVECTOR Reciprocal = _mm_rcp_ps(X); + vConstants = _mm_mul_ps(Y, Reciprocal); vConstants = XMVectorATanEst(vConstants); // Discard entries that have been declared void + + XMVECTOR R3 = XMVectorSelect( Pi, g_XMZero, XIsPositive ); + vConstants = _mm_add_ps( vConstants, R3 ); + vResult = XMVectorSelect(vResult,vConstants,ATanResultValid); return vResult; #else // _XM_VMX128_INTRINSICS_ @@ -12777,13 +12782,13 @@ XMFINLINE _XMUICO4& _XMUICO4::operator= XMFINLINE _XMCOLOR::_XMCOLOR ( - FLOAT _x, - FLOAT _y, - FLOAT _z, - FLOAT _w + FLOAT _r, + FLOAT _g, + FLOAT _b, + FLOAT _a ) { - XMStoreColor(this, XMVectorSet(_x, _y, _z, _w)); + XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); } //------------------------------------------------------------------------------