mirror of https://github.com/microsoft/DirectXMath synced 2024-11-08 21:50:09 +00:00

DirectXMath 3.03

Chuck Walbourn 2016-05-23 14:15:41 -07:00
parent 409c3a3646
commit fd7f30458d
6 changed files with 500 additions and 534 deletions

View File

@ -17,7 +17,7 @@
#error DirectX Math requires C++
#endif
#define DIRECTX_MATH_VERSION 302
#define DIRECTX_MATH_VERSION 303
#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_)
#if defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM)
@ -29,6 +29,8 @@
#endif
#endif // !_XM_BIGENDIAN_ && !_XM_LITTLEENDIAN_
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_M_IX86) || defined(_M_AMD64)
#define _XM_SSE_INTRINSICS_
@ -62,15 +64,7 @@
#endif
#endif
#ifdef _WIN32_WCE
inline float powf(float _X, float _Y) { return ((float)pow((double)_X, (double)_Y)); }
inline float logf(float _X) { return ((float)log((double)_X)); }
inline float tanf(float _X) { return ((float)tan((double)_X)); }
inline float atanf(float _X) { return ((float)atan((double)_X)); }
inline float sinhf(float _X) { return ((float)sinh((double)_X)); }
inline float coshf(float _X) { return ((float)cosh((double)_X)); }
inline float tanhf(float _X) { return ((float)tanh((double)_X)); }
#endif
#include <sal.h>
#include <assert.h>
@ -261,8 +255,8 @@ __declspec(align(16)) struct XMVECTORF32
inline operator XMVECTOR() const { return v; }
inline operator const float*() const { return f; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
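For context, _mm_castps_si128 and _mm_castps_pd reinterpret the same 128-bit register in place; unlike the old reinterpret_cast-through-memory form they generate no instructions and carry no pointer-aliasing concerns. A minimal sketch (assumed caller code, not part of this commit; SSE2 intrinsics):

__m128  v    = _mm_set1_ps(1.0f);        // 0x3F800000 in every lane
__m128i bits = _mm_castps_si128(v);      // the same 128 bits, viewed as int32 lanes
__m128  back = _mm_castsi128_ps(bits);   // round-trips to a value identical to v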
@ -276,8 +270,8 @@ __declspec(align(16)) struct XMVECTORI32
inline operator XMVECTOR() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
@ -291,8 +285,8 @@ __declspec(align(16)) struct XMVECTORU8
inline operator XMVECTOR() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
@ -306,8 +300,8 @@ __declspec(align(16)) struct XMVECTORU32
inline operator XMVECTOR() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
inline operator __m128i() const { return _mm_castps_si128(v); }
inline operator __m128d() const { return _mm_castps_pd(v); }
#endif
};
@ -350,6 +344,7 @@ struct XMMATRIX
__declspec(align(16)) struct XMMATRIX
#endif
{
#ifdef _XM_NO_INTRINSICS_
union
{
XMVECTOR r[4];
@ -362,6 +357,9 @@ __declspec(align(16)) struct XMMATRIX
};
float m[4][4];
};
#else
XMVECTOR r[4];
#endif
XMMATRIX() {}
XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, GXMVECTOR R3) { r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; }
@ -371,8 +369,10 @@ __declspec(align(16)) struct XMMATRIX
float m30, float m31, float m32, float m33);
explicit XMMATRIX(_In_reads_(16) const float *pArray);
#ifdef _XM_NO_INTRINSICS_
float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }
#endif
XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; }
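Because XMMATRIX now exposes the float m[4][4] members and operator() only when _XM_NO_INTRINSICS_ is defined, callers built with SSE/NEON reach individual elements through the row vectors or a stored XMFLOAT4X4. A sketch of both patterns (assumed caller code, with using namespace DirectX):

XMMATRIX M = XMMatrixIdentity();
// Option 1: spill the matrix to memory and index normally.
XMFLOAT4X4 tmp;
XMStoreFloat4x4(&tmp, M);
float m23 = tmp.m[2][3];
// Option 2: pull one component straight from a row vector.
float m23b = XMVectorGetW(M.r[2]);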
@ -403,7 +403,7 @@ struct XMFLOAT2
XMFLOAT2() {}
XMFLOAT2(float _x, float _y) : x(_x), y(_y) {}
XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {}
explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {}
XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; }
};
@ -413,7 +413,7 @@ __declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2
{
XMFLOAT2A() : XMFLOAT2() {}
XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {}
XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {}
explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {}
XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; }
};
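Marking the pointer constructors explicit removes the silent conversion from a raw float* to these types. A sketch of a call site that now needs an explicit construction (hypothetical caller code; SetOffset is not a library function):

void SetOffset(const XMFLOAT2& offset);   // hypothetical API taking an XMFLOAT2

float data[2] = { 1.0f, 2.0f };
// SetOffset(data);              // converted implicitly before this change
SetOffset(XMFLOAT2(data));       // the conversion must now be spelled out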
@ -455,7 +455,7 @@ struct XMFLOAT3
XMFLOAT3() {}
XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {}
XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; }
};
@ -465,7 +465,7 @@ __declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3
{
XMFLOAT3A() : XMFLOAT3() {}
XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {}
XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {}
explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {}
XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; }
};
@ -482,7 +482,7 @@ struct XMINT3
XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {}
explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
XMINT3& operator= (const XMINT3& Int3) { x = Int3.x; y = Int3.y; z = Int3.z; return *this; }
XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; }
};
// 3D Vector; 32 bit unsigned integer components
@ -496,7 +496,7 @@ struct XMUINT3
XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {}
explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
XMUINT3& operator= (const XMUINT3& UInt3) { x = UInt3.x; y = UInt3.y; z = UInt3.z; return *this; }
XMUINT3& operator= (const XMUINT3& u3) { x = u3.x; y = u3.y; z = u3.z; return *this; }
};
//------------------------------------------------------------------------------
@ -510,7 +510,7 @@ struct XMFLOAT4
XMFLOAT4() {}
XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {}
XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; }
};
@ -520,7 +520,7 @@ __declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4
{
XMFLOAT4A() : XMFLOAT4() {}
XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {}
XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {}
explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {}
XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; }
};
@ -1368,6 +1368,8 @@ template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; }
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
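XM_PERMUTE_PS wraps the single-source _mm_shuffle_ps pattern behind one macro, so every self-shuffle in the headers goes through a single definition. Usage sketch (assumed caller code):

__m128 v    = _mm_set_ps(3.f, 2.f, 1.f, 0.f);         // lanes: x=0, y=1, z=2, w=3
__m128 yyyy = XM_PERMUTE_PS(v, _MM_SHUFFLE(1,1,1,1)); // broadcast lane 1 -> {1,1,1,1}
__m128 wzyx = XM_PERMUTE_PS(v, _MM_SHUFFLE(0,1,2,3)); // reverse the lanes -> {3,2,1,0}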
// PermuteHelper internal template (SSE only)
namespace Internal
{
@ -1384,8 +1386,8 @@ namespace Internal
WhichW ? 0xFFFFFFFF : 0,
};
XMVECTOR shuffled1 = _mm_shuffle_ps(v1, v1, Shuffle);
XMVECTOR shuffled2 = _mm_shuffle_ps(v2, v2, Shuffle);
XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
@ -1397,13 +1399,13 @@ namespace Internal
// Fast path for permutes that only read from the first vector.
template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
{
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_shuffle_ps(v1, v1, Shuffle); }
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); }
};
// Fast path for permutes that only read from the second vector.
template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
{
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_shuffle_ps(v2, v2, Shuffle); }
static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); }
};
// Fast path for permutes that read XY from the first vector, ZW from the second.
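For reference, XMVectorPermute selects each output component from eight inputs (indices 0-3 come from V1, 4-7 from V2); the specializations above are the fast paths when all four indices land in a single source. Usage sketch (assuming using namespace DirectX):

XMVECTOR a = XMVectorSet(0.f, 1.f, 2.f, 3.f);
XMVECTOR b = XMVectorSet(4.f, 5.f, 6.f, 7.f);
XMVECTOR r = XMVectorPermute<0, 5, 2, 7>(a, b);   // r = {0, 5, 2, 7}
XMVECTOR s = XMVectorPermute<3, 2, 1, 0>(a, b);   // all from a -> single XM_PERMUTE_PS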
@ -1488,7 +1490,7 @@ template<> inline XMVECTOR XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2)
template<> inline XMVECTOR XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); }
template<> inline XMVECTOR XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); }
#endif _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
//------------------------------------------------------------------------------
@ -1502,7 +1504,7 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
return _mm_shuffle_ps( V, V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
return __vpermwi(V, ((SwizzleX & 3) << 6) | ((SwizzleY & 3) << 4) | ((SwizzleZ & 3) << 2) | (SwizzleW & 3) );
#else
@ -1515,6 +1517,7 @@ template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t Swizz
// Specialized swizzles
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
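XMVectorSwizzle is the single-vector counterpart; on SSE it lowers to the XM_PERMUTE_PS call above, and the <0,1,2,3> identity specialization simply returns V. Usage sketch (assuming using namespace DirectX):

XMVECTOR v = XMVectorSet(1.f, 2.f, 3.f, 4.f);
XMVECTOR r = XMVectorSwizzle<3, 2, 1, 0>(v);   // r = {4, 3, 2, 1}
XMVECTOR s = XMVectorSwizzle<0, 0, 0, 0>(v);   // splat x -> {1, 1, 1, 1}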
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
@ -1548,7 +1551,7 @@ template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); }
#endif _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
//------------------------------------------------------------------------------
@ -1760,7 +1763,7 @@ inline XMVECTOR XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2,
vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
// 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
vTemp = _mm_and_si128(vTemp,g_XMOne);
return reinterpret_cast<const __m128 *>(&vTemp)[0];
return _mm_castsi128_ps(vTemp);
#endif
}
@ -1799,7 +1802,7 @@ inline XMVECTOR XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent)
// Splat the scalar value (It's really a float)
vScale = _mm_set1_epi32(uScale);
// Multiply by the reciprocal (Perform a right shift by DivExponent)
vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
return vResult;
#endif
}
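The uScale expression builds 1.0f/(1 << DivExponent) directly in the float's exponent field: 0x3F800000 is the bit pattern of 1.0f, and subtracting DivExponent << 23 lowers the biased exponent by DivExponent. A worked example with DivExponent = 4:

uScale = 0x3F800000 - (4 << 23)
       = 0x3F800000 - 0x02000000
       = 0x3D800000          // the bit pattern of 0.0625f, i.e. 1.0f/16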
@ -1824,13 +1827,14 @@ inline XMVECTOR XMVectorSplatConstantInt(int32_t IntConstant)
}
// Implemented for VMX128 intrinsics as #defines above
#endif _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
#include "DirectXMathConvert.inl"
#include "DirectXMathVector.inl"
#include "DirectXMathMatrix.inl"
#include "DirectXMathMisc.inl"
#pragma prefast(pop)
#pragma warning(pop)

View File

@ -50,12 +50,12 @@ inline XMVECTOR XMConvertVectorIntToFloat
return vmulq_f32( vResult, vScale );
#else // _XM_SSE_INTRINSICS_
// Convert to floats
XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
// Convert DivExponent into 1.0f/(1<<DivExponent)
uint32_t uScale = 0x3F800000U - (DivExponent << 23);
// Splat the scalar value
__m128i vScale = _mm_set1_epi32(uScale);
vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
return vResult;
#endif
}
@ -108,7 +108,7 @@ inline XMVECTOR XMConvertVectorFloatToInt
__m128i vResulti = _mm_cvttps_epi32(vResult);
// If there was positive overflow, set to 0x7FFFFFFF
vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
return vOverflow;
#endif
@ -143,17 +143,17 @@ inline XMVECTOR XMConvertVectorUIntToFloat
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
// Convert DivExponent into 1.0f/(1<<DivExponent)
uint32_t uScale = 0x3F800000U - (DivExponent << 23);
// Splat
iMask = _mm_set1_epi32(uScale);
vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
return vResult;
#endif
}
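_mm_cvtepi32_ps only converts signed inputs, so this path clears the sign bit before converting and then adds 2^31 (g_XMFixUnsigned) back only in the lanes that had it set. A worked lane, assuming the input 0xFFFFFFFF:

0xFFFFFFFF ^ 0x80000000 = 0x7FFFFFFF   // force positive: 2147483647
converted as signed      ≈ 2147483648.0f   // rounded by the float conversion
+ 2147483648.0f fixup    ≈ 4294967296.0f   // back to the original unsigned magnitude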
@ -213,7 +213,7 @@ inline XMVECTOR XMConvertVectorFloatToUInt
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
return vResult;
@ -404,7 +404,7 @@ inline XMVECTOR XMLoadSInt2
__m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
__m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
__m128 V = _mm_unpacklo_ps( x, y );
return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
return _mm_cvtepi32_ps(_mm_castps_si128(V));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -439,11 +439,11 @@ inline XMVECTOR XMLoadUInt2
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(V,vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
@ -596,7 +596,7 @@ inline XMVECTOR XMLoadSInt3
__m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
__m128 xy = _mm_unpacklo_ps( x, y );
__m128 V = _mm_movelh_ps( xy, z );
return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
return _mm_cvtepi32_ps(_mm_castps_si128(V));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -634,11 +634,11 @@ inline XMVECTOR XMLoadUInt3
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(V,vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
return vResult;
@ -792,15 +792,15 @@ inline XMVECTOR XMLoadUInt4
__m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
// For the values that are higher than 0x7FFFFFFF, a fixup is needed
// Determine which ones need the fix.
XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero);
// Force all values positive
XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask);
// Convert to floats
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Convert 0x80000000 -> 0xFFFFFFFF
__m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
__m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
// For only the ones that are too big, add the fixup
vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
vResult = _mm_add_ps(vResult,vMask);
return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
@ -934,7 +934,7 @@ inline XMMATRIX XMLoadFloat4x3
// vTemp2 = y2,z2,x2,x2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
// vTemp2 = x2,y2,z2,z2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
// vTemp1 = x1,y1,z1,0
vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
// vTemp2 = x2,y2,z2,0
@ -942,13 +942,13 @@ inline XMMATRIX XMLoadFloat4x3
// vTemp3 = x3,y3,z3,0
vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
// vTemp4i = x4,y4,z4,0
__m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
__m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
// vTemp4i = x4,y4,z4,1.0f
vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
XMMATRIX M(vTemp1,
vTemp2,
vTemp3,
reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
_mm_castsi128_ps(vTemp4i));
return M;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
@ -1016,7 +1016,7 @@ inline XMMATRIX XMLoadFloat4x3A
// vTemp2 = y2,z2,x2,x2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
// vTemp2 = x2,y2,z2,z2
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
// vTemp1 = x1,y1,z1,0
vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
// vTemp2 = x2,y2,z2,0
@ -1024,13 +1024,13 @@ inline XMMATRIX XMLoadFloat4x3A
// vTemp3 = x3,y3,z3,0
vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
// vTemp4i = x4,y4,z4,0
__m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
__m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
// vTemp4i = x4,y4,z4,1.0f
vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
XMMATRIX M(vTemp1,
vTemp2,
vTemp3,
reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
_mm_castsi128_ps(vTemp4i));
return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@ -1195,7 +1195,7 @@ inline void XMStoreInt2
__n64 VL = vget_low_u32(V);
vst1_u32( pDestination, VL );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
#else // _XM_VMX128_INTRINSICS_
@ -1219,7 +1219,7 @@ inline void XMStoreInt2A
__n64 VL = vget_low_u32(V);
vst1_u32_ex( pDestination, VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1240,7 +1240,7 @@ inline void XMStoreFloat2
__n64 VL = vget_low_f32(V);
vst1_f32( reinterpret_cast<float*>(pDestination), VL );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
@ -1264,7 +1264,7 @@ inline void XMStoreFloat2A
__n64 VL = vget_low_f32(V);
vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1292,10 +1292,10 @@ inline void XMStoreSInt2
__m128i vResulti = _mm_cvttps_epi32(V);
// If there was positive overflow, set to 0x7FFFFFFF
XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
// Write two ints
XMVECTOR T = _mm_shuffle_ps( vOverflow, vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
@ -1333,11 +1333,11 @@ inline void XMStoreUInt2
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
// Write two uints
XMVECTOR T = _mm_shuffle_ps( vResult, vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
@ -1362,8 +1362,8 @@ inline void XMStoreInt3
vst1_u32( pDestination, VL );
vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(pDestination), V );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
@ -1390,8 +1390,8 @@ inline void XMStoreInt3A
vst1_u32_ex( pDestination, VL, 64 );
vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@ -1415,8 +1415,8 @@ inline void XMStoreFloat3
vst1_f32( reinterpret_cast<float*>(pDestination), VL );
vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( &pDestination->x, V );
_mm_store_ss( &pDestination->y, T1 );
_mm_store_ss( &pDestination->z, T2 );
@ -1443,8 +1443,8 @@ inline void XMStoreFloat3A
vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
_mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
_mm_store_ss( &pDestination->z, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
@ -1475,11 +1475,11 @@ inline void XMStoreSInt3
__m128i vResulti = _mm_cvttps_epi32(V);
// If there was positive overflow, set to 0x7FFFFFFF
XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
// Write 3 uints
XMVECTOR T1 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
@ -1520,12 +1520,12 @@ inline void XMStoreUInt3
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
// Write 3 uints
XMVECTOR T1 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
_mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
@ -1550,7 +1550,7 @@ inline void XMStoreInt4
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_u32( pDestination, V );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1573,7 +1573,7 @@ inline void XMStoreInt4A
#elif defined(_XM_ARM_NEON_INTRINSICS_)
vst1q_u32_ex( pDestination, V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
_mm_store_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&V)[0] );
_mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1649,9 +1649,9 @@ inline void XMStoreSInt4
__m128i vResulti = _mm_cvttps_epi32(V);
// If there was positive overflow, set to 0x7FFFFFFF
XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
vOverflow = _mm_or_ps(vOverflow,vResult);
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&vOverflow)[0] );
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1688,10 +1688,10 @@ inline void XMStoreUInt4
__m128i vResulti = _mm_cvttps_epi32(vResult);
// Convert from signed to unsigned only if greater than 0x80000000
vMask = _mm_and_ps(vMask,g_XMNegativeZero);
vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
// On those that are too large, set to 0xFFFFFFFF
vResult = _mm_or_ps(vResult,vOverflow);
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), reinterpret_cast<const __m128i *>(&vResult)[0] );
_mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1738,7 +1738,7 @@ inline void XMStoreFloat3x3
_mm_storeu_ps(&pDestination->m[0][0],vTemp1);
vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
_mm_storeu_ps(&pDestination->m[1][1],vTemp2);
vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
_mm_store_ss(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_

View File

@ -319,10 +319,10 @@ inline XMMATRIX XMMatrixMultiply
// Use vW to hold the original row
XMVECTOR vW = M1.r[0];
// Splat the component X,Y,Z then W
XMVECTOR vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
// Perform the operation on the first row
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
@ -335,10 +335,10 @@ inline XMMATRIX XMMatrixMultiply
mResult.r[0] = vX;
// Repeat for the other 3 rows
vW = M1.r[1];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -348,10 +348,10 @@ inline XMMATRIX XMMatrixMultiply
vX = _mm_add_ps(vX,vY);
mResult.r[1] = vX;
vW = M1.r[2];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -361,10 +361,10 @@ inline XMMATRIX XMMatrixMultiply
vX = _mm_add_ps(vX,vY);
mResult.r[2] = vX;
vW = M1.r[3];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
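Each row of XMMatrixMultiply follows the same splat-and-accumulate pattern; in scalar terms (shown for reference, not part of the commit):

// result.r[i] = M1[i].x*M2.r[0] + M1[i].y*M2.r[1] + M1[i].z*M2.r[2] + M1[i].w*M2.r[3]
// XM_PERMUTE_PS(vW, _MM_SHUFFLE(k,k,k,k)) broadcasts component k of M1's row so the
// four multiply/add steps operate on whole 4-wide registers instead of scalars.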
@ -490,10 +490,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
// Use vW to hold the original row
XMVECTOR vW = M1.r[0];
// Splat the component X,Y,Z then W
XMVECTOR vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
// Perform the operation on the first row
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
@ -506,10 +506,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
__m128 r0 = vX;
// Repeat for the other 3 rows
vW = M1.r[1];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -519,10 +519,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
vX = _mm_add_ps(vX,vY);
__m128 r1 = vX;
vW = M1.r[2];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -532,10 +532,10 @@ inline XMMATRIX XMMatrixMultiplyTranspose
vX = _mm_add_ps(vX,vY);
__m128 r2 = vX;
vW = M1.r[3];
vX = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(0,0,0,0));
vY = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(1,1,1,1));
vZ = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(2,2,2,2));
vW = _mm_shuffle_ps(vW,vW,_MM_SHUFFLE(3,3,3,3));
vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0));
vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1));
vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2));
vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3));
vX = _mm_mul_ps(vX,M2.r[0]);
vY = _mm_mul_ps(vY,M2.r[1]);
vZ = _mm_mul_ps(vZ,M2.r[2]);
@ -738,10 +738,10 @@ inline XMMATRIX XMMatrixInverse
#elif defined(_XM_SSE_INTRINSICS_)
XMMATRIX MT = XMMatrixTranspose(M);
XMVECTOR V00 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V10 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V11 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0));
XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2));
XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0));
XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1));
@ -749,10 +749,10 @@ inline XMMATRIX XMMatrixInverse
XMVECTOR D1 = _mm_mul_ps(V01,V11);
XMVECTOR D2 = _mm_mul_ps(V02,V12);
V00 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(3,2,3,2));
V10 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(1,1,0,0));
V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(3,2,3,2));
V11 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(1,1,0,0));
V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2));
V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0));
V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2));
V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0));
V02 = _mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1));
V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0));
@ -764,15 +764,15 @@ inline XMMATRIX XMMatrixInverse
D2 = _mm_sub_ps(D2,V02);
// V11 = D0Y,D0W,D2Y,D2Y
V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1));
V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(1,0,2,1));
V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1));
V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2));
V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(0,1,0,2));
V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2));
V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1));
// V13 = D1Y,D1W,D2W,D2W
XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1));
V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(1,0,2,1));
V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1));
V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2));
XMVECTOR V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(0,1,0,2));
XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2));
V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1));
XMVECTOR C0 = _mm_mul_ps(V00,V10);
@ -782,15 +782,15 @@ inline XMMATRIX XMMatrixInverse
// V11 = D0X,D0Y,D2X,D2X
V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0));
V00 = _mm_shuffle_ps(MT.r[1], MT.r[1],_MM_SHUFFLE(2,1,3,2));
V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2));
V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3));
V01 = _mm_shuffle_ps(MT.r[0], MT.r[0],_MM_SHUFFLE(1,3,2,3));
V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3));
V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2));
// V13 = D1X,D1Y,D2Z,D2Z
V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0));
V02 = _mm_shuffle_ps(MT.r[3], MT.r[3],_MM_SHUFFLE(2,1,3,2));
V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2));
V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3));
V03 = _mm_shuffle_ps(MT.r[2], MT.r[2],_MM_SHUFFLE(1,3,2,3));
V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3));
V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2));
V00 = _mm_mul_ps(V00,V10);
@ -802,22 +802,22 @@ inline XMMATRIX XMMatrixInverse
C4 = _mm_sub_ps(C4,V02);
C6 = _mm_sub_ps(C6,V03);
V00 = _mm_shuffle_ps(MT.r[1],MT.r[1],_MM_SHUFFLE(0,3,0,3));
V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3));
// V10 = D0Z,D0Z,D2X,D2Y
V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2));
V10 = _mm_shuffle_ps(V10,V10,_MM_SHUFFLE(0,2,3,0));
V01 = _mm_shuffle_ps(MT.r[0],MT.r[0],_MM_SHUFFLE(2,0,3,1));
V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0));
V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1));
// V11 = D0X,D0W,D2X,D2Y
V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0));
V11 = _mm_shuffle_ps(V11,V11,_MM_SHUFFLE(2,1,0,3));
V02 = _mm_shuffle_ps(MT.r[3],MT.r[3],_MM_SHUFFLE(0,3,0,3));
V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3));
V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3));
// V12 = D1Z,D1Z,D2Z,D2W
V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2));
V12 = _mm_shuffle_ps(V12,V12,_MM_SHUFFLE(0,2,3,0));
V03 = _mm_shuffle_ps(MT.r[2],MT.r[2],_MM_SHUFFLE(2,0,3,1));
V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0));
V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1));
// V13 = D1X,D1W,D2Z,D2W
V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0));
V13 = _mm_shuffle_ps(V13,V13,_MM_SHUFFLE(2,1,0,3));
V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3));
V00 = _mm_mul_ps(V00,V10);
V01 = _mm_mul_ps(V01,V11);
@ -836,10 +836,10 @@ inline XMMATRIX XMMatrixInverse
C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0));
C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0));
C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0));
C0 = _mm_shuffle_ps(C0,C0,_MM_SHUFFLE(3,1,2,0));
C2 = _mm_shuffle_ps(C2,C2,_MM_SHUFFLE(3,1,2,0));
C4 = _mm_shuffle_ps(C4,C4,_MM_SHUFFLE(3,1,2,0));
C6 = _mm_shuffle_ps(C6,C6,_MM_SHUFFLE(3,1,2,0));
C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0));
C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0));
C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0));
C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0));
// Get the determinant
XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]);
if (pDeterminant != nullptr)
@ -1357,7 +1357,7 @@ inline XMMATRIX XMMatrixRotationX
M.r[0] = g_XMIdentityR0;
M.r[1] = vCos;
// x = 0,y = sin,z = cos, w = 0
vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,1,2,0));
vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0));
// x = 0,y = -sin,z = cos, w = 0
vCos = _mm_mul_ps(vCos,g_XMNegateY);
M.r[2] = vCos;
@ -1434,7 +1434,7 @@ inline XMMATRIX XMMatrixRotationY
M.r[2] = vSin;
M.r[1] = g_XMIdentityR1;
// x = cos,y = 0,z = sin, w = 0
vSin = _mm_shuffle_ps(vSin,vSin,_MM_SHUFFLE(3,0,1,2));
vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2));
// x = cos,y = 0,z = -sin, w = 0
vSin = _mm_mul_ps(vSin,g_XMNegateZ);
M.r[0] = vSin;
@ -1510,7 +1510,7 @@ inline XMMATRIX XMMatrixRotationZ
XMMATRIX M;
M.r[0] = vCos;
// x = sin,y = cos,z = 0, w = 0
vCos = _mm_shuffle_ps(vCos,vCos,_MM_SHUFFLE(3,2,0,1));
vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1));
// x = cos,y = -sin,z = 0, w = 0
vCos = _mm_mul_ps(vCos,g_XMNegateX);
M.r[1] = vCos;
@ -1597,8 +1597,8 @@ inline XMMATRIX XMMatrixRotationNormal
XMVECTOR C1 = _mm_set_ps1(fCosAngle);
XMVECTOR C0 = _mm_set_ps1(fSinAngle);
XMVECTOR N0 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,0,2,1));
XMVECTOR N1 = _mm_shuffle_ps(NormalAxis,NormalAxis,_MM_SHUFFLE(3,1,0,2));
XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1));
XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2));
XMVECTOR V0 = _mm_mul_ps(C2, N0);
V0 = _mm_mul_ps(V0, N1);
@ -1614,18 +1614,18 @@ inline XMMATRIX XMMatrixRotationNormal
V0 = _mm_and_ps(R0,g_XMMask3);
XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0));
V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(0,3,2,1));
V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1));
XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1));
V2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(2,0,2,0));
V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0));
R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0));
R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,2,0));
R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0));
XMMATRIX M;
M.r[0] = R2;
R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1));
R2 = _mm_shuffle_ps(R2,R2,_MM_SHUFFLE(1,3,0,2));
R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2));
M.r[1] = R2;
V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0));
@ -1702,37 +1702,37 @@ inline XMMATRIX XMMatrixRotationQuaternion
XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion);
XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0);
XMVECTOR V0 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,0,0,1));
XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1));
V0 = _mm_and_ps(V0,g_XMMask3);
XMVECTOR V1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(3,1,2,2));
XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2));
V1 = _mm_and_ps(V1,g_XMMask3);
XMVECTOR R0 = _mm_sub_ps(Constant1110,V0);
R0 = _mm_sub_ps(R0, V1);
V0 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,1,0,0));
V1 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,2,1,2));
V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0));
V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2));
V0 = _mm_mul_ps(V0, V1);
V1 = _mm_shuffle_ps(Quaternion,Quaternion,_MM_SHUFFLE(3,3,3,3));
XMVECTOR V2 = _mm_shuffle_ps(Q0,Q0,_MM_SHUFFLE(3,0,2,1));
V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3));
XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1));
V1 = _mm_mul_ps(V1, V2);
XMVECTOR R1 = _mm_add_ps(V0, V1);
XMVECTOR R2 = _mm_sub_ps(V0, V1);
V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1));
V0 = _mm_shuffle_ps(V0,V0,_MM_SHUFFLE(1,3,2,0));
V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0));
V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0));
V1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(2,0,2,0));
V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0));
Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0));
Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,2,0));
Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0));
XMMATRIX M;
M.r[0] = Q1;
Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1));
Q1 = _mm_shuffle_ps(Q1,Q1,_MM_SHUFFLE(1,3,0,2));
Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2));
M.r[1] = Q1;
Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0));
@ -2487,10 +2487,10 @@ inline XMMATRIX XMMatrixPerspectiveOffCenterLH
vTemp = _mm_and_ps(vTemp,g_XMMaskY);
M.r[1] = vTemp;
// 0,0,fRange,1.0f
M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
M.m[2][2] = fRange;
M.m[2][3] = 1.0f;
M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth,
-(ViewTop + ViewBottom) * ReciprocalHeight,
fRange,
1.0f );
// 0,0,-fRange * NearZ,0.0f
vValues = _mm_and_ps(vValues,g_XMMaskZ);
M.r[3] = vValues;
@ -2585,10 +2585,10 @@ inline XMMATRIX XMMatrixPerspectiveOffCenterRH
vTemp = _mm_and_ps(vTemp,g_XMMaskY);
M.r[1] = vTemp;
// 0,0,fRange,1.0f
M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth;
M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight;
M.m[2][2] = fRange;
M.m[2][3] = -1.0f;
M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth,
(ViewTop + ViewBottom) * ReciprocalHeight,
fRange,
-1.0f );
// 0,0,-fRange * NearZ,0.0f
vValues = _mm_and_ps(vValues,g_XMMaskZ);
M.r[3] = vValues;

View File

@ -156,23 +156,23 @@ inline XMVECTOR XMQuaternionMultiply
XMVECTOR Q2Z = Q2;
XMVECTOR vResult = Q2;
// Splat with one instruction
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,3,3,3));
Q2X = _mm_shuffle_ps(Q2X,Q2X,_MM_SHUFFLE(0,0,0,0));
Q2Y = _mm_shuffle_ps(Q2Y,Q2Y,_MM_SHUFFLE(1,1,1,1));
Q2Z = _mm_shuffle_ps(Q2Z,Q2Z,_MM_SHUFFLE(2,2,2,2));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3));
Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0));
Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1));
Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2));
// Retire Q1 and perform Q1*Q2W
vResult = _mm_mul_ps(vResult,Q1);
XMVECTOR Q1Shuffle = Q1;
// Shuffle the copies of Q1
Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
// Mul by Q1WZYX
Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
// Flip the signs on y and z
Q2X = _mm_mul_ps(Q2X,ControlWZYX);
// Mul by Q1ZWXY
Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
Q1Shuffle = _mm_shuffle_ps(Q1Shuffle,Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
// Flip the signs on z and w
Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
// Mul by Q1YXWZ
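For reference, the product being assembled is the standard Hamilton quaternion product (components in x,y,z,w order; which argument maps to a and which to b follows the library's own concatenation convention):

(a*b).x = aw*bx + ax*bw + ay*bz - az*by
(a*b).y = aw*by - ax*bz + ay*bw + az*bx
(a*b).z = aw*bz + ax*by - ay*bx + az*bw
(a*b).w = aw*bw - ax*bx - ay*by - az*bz
// ControlWZYX/ControlZWXY/ControlYXWZ are the {+1,-1} sign masks that give each
// shuffled multiply the signs this expansion requires.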
@ -438,7 +438,7 @@ inline XMVECTOR XMQuaternionSlerpV
XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
XMVECTOR V01 = _mm_shuffle_ps(T,T,_MM_SHUFFLE(2,3,0,1));
XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1));
V01 = _mm_and_ps(V01,MaskXY);
V01 = _mm_xor_ps(V01,SignMask2);
V01 = _mm_add_ps(g_XMIdentityR0, V01);
@ -897,11 +897,11 @@ inline XMVECTOR XMQuaternionRotationMatrix
XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0)
// (r00, r00, r00, r00)
XMVECTOR r00 = _mm_shuffle_ps(r0, r0, _MM_SHUFFLE(0,0,0,0));
XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0));
// (r11, r11, r11, r11)
XMVECTOR r11 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1,1,1,1));
XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1));
// (r22, r22, r22, r22)
XMVECTOR r22 = _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(2,2,2,2));
XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2));
// x^2 >= y^2 equivalent to r11 - r00 <= 0
// (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
@ -935,7 +935,7 @@ inline XMVECTOR XMQuaternionRotationMatrix
// (r10, r10, r20, r21)
t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0));
// (r10, r20, r21, r10)
t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1,3,2,0));
t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
// (4*x*y, 4*x*z, 4*y*z, unused)
XMVECTOR xyxzyz = _mm_add_ps(t0, t1);
@ -944,7 +944,7 @@ inline XMVECTOR XMQuaternionRotationMatrix
// (r12, r12, r02, r01)
t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2));
// (r12, r02, r01, r12)
t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1,3,2,0));
t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
// (4*x*w, 4*y*w, 4*z*w, unused)
XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
xwywzw = _mm_mul_ps(XMMPMP, xwywzw);
@ -1137,15 +1137,15 @@ inline XMVECTOR XMPlaneNormalizeEst
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(P,P);
// x=Dot.y, y=Dot.z
XMVECTOR vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
// Result.x = x+y
vDot = _mm_add_ss(vDot,vTemp);
// x=Dot.z
vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
// Result.x = (x+y)+z
vDot = _mm_add_ss(vDot,vTemp);
// Splat x
vDot = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
// Get the reciprocal
vDot = _mm_rsqrt_ps(vDot);
// Get the reciprocal
@ -1183,11 +1183,11 @@ inline XMVECTOR XMPlaneNormalize
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product on x,y and z only
XMVECTOR vLengthSq = _mm_mul_ps(P,P);
XMVECTOR vTemp = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(2,1,2,1));
XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
vLengthSq = _mm_add_ss(vLengthSq,vTemp);
vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
vLengthSq = _mm_add_ss(vLengthSq,vTemp);
vLengthSq = _mm_shuffle_ps(vLengthSq,vLengthSq,_MM_SHUFFLE(0,0,0,0));
vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
// Prepare for the division
XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
// Failsafe on zero (Or epsilon) length planes
@ -1531,24 +1531,11 @@ inline XMVECTOR XMColorAdjustSaturation
return vbslq_f32( g_XMSelect1110, vResult, vColor );
#elif defined(_XM_SSE_INTRINSICS_)
static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
// Mul RGB by intensity constants
XMVECTOR vLuminance = _mm_mul_ps(vColor,gvLuminance);
// vResult.x = vLuminance.y, vResult.y = vLuminance.y,
// vResult.z = vLuminance.z, vResult.w = vLuminance.z
XMVECTOR vResult = vLuminance;
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,1,1));
// vLuminance.x += vLuminance.y
vLuminance = _mm_add_ss(vLuminance,vResult);
// Splat vLuminance.z
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
// vLuminance.x += vLuminance.z (Dot product)
vLuminance = _mm_add_ss(vLuminance,vResult);
// Splat vLuminance
vLuminance = _mm_shuffle_ps(vLuminance,vLuminance,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance );
// Splat fSaturation
XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
vResult = _mm_sub_ps(vColor,vLuminance);
XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance);
vResult = _mm_mul_ps(vResult,vSaturation);
vResult = _mm_add_ps(vResult,vLuminance);
// Retain w from the source color
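The computation itself is unchanged; the luminance dot product now goes through XMVector3Dot, which already returns the dot product replicated into every lane (exactly what the removed shuffle/add sequence built by hand). For reference:

luminance  = 0.2125*R + 0.7154*G + 0.0721*B      // splatted across all four lanes
result.rgb = (color.rgb - luminance) * fSaturation + luminance
result.w   = color.w                             // restored by the select that follows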

File diff suppressed because it is too large

View File

@ -250,7 +250,7 @@ inline XMVECTOR PackedVector::XMLoadShortN2
// x needs to be sign extended
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x - 0x8000 to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
// Convert -1.0f - 1.0f
@ -286,7 +286,7 @@ inline XMVECTOR PackedVector::XMLoadShort2
// x needs to be sign extended
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x - 0x8000 to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
// Y is 65536 too large
@ -322,7 +322,7 @@ inline XMVECTOR PackedVector::XMLoadUShortN2
// y needs to be sign flipped
vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// y + 0x8000 to undo the signed order.
vTemp = _mm_add_ps(vTemp,FixaddY16);
// Y is 65536 times too large
@ -358,7 +358,7 @@ inline XMVECTOR PackedVector::XMLoadUShort2
// y needs to be sign flipped
vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// Y is 65536 times too large
vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
// y + 0x8000 to undo the signed order.
@ -452,7 +452,7 @@ inline XMVECTOR PackedVector::XMLoadU565
// Mask off x, y and z
vResult = _mm_and_ps(vResult,U565And);
// Convert to float
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Normalize x, y, and z
vResult = _mm_mul_ps(vResult,U565Mul);
return vResult;
@ -741,17 +741,17 @@ inline XMVECTOR PackedVector::XMLoadShortN4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// x and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x and z - 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
// Convert to -1.0f - 1.0f
vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
// Clamp result (for case of -32768)
return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
@ -782,17 +782,17 @@ inline XMVECTOR PackedVector::XMLoadShort4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// x and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x and z - 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
// Fix y and w because they are 65536 too large
vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -825,17 +825,17 @@ inline XMVECTOR PackedVector::XMLoadUShortN4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// y and w are signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// y and w + 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,FixaddY16W16);
// Fix y and w because they are 65536 too large
vTemp = _mm_mul_ps(vTemp,FixupY16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -865,17 +865,17 @@ inline XMVECTOR PackedVector::XMLoadUShort4
// Splat the color in all four entries (x,z,y,w)
__m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
// Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
__m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
__m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
// y and w are signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// Fix y and w because they are 65536 too large
vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
// y and w + 0x8000 to complete the conversion
vTemp = _mm_add_ps(vTemp,FixaddY16W16);
// Very important! The entries are x,z,y,w, flip it to x,y,z,w
return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
@ -910,7 +910,7 @@ inline XMVECTOR PackedVector::XMLoadXDecN4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
// Convert 0-255 to 0.0f-1.0f
@ -953,7 +953,7 @@ inline XMVECTOR PackedVector::XMLoadXDec4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,XDec4Xor);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,XDec4Add);
// Convert 0-255 to 0.0f-1.0f
@ -993,7 +993,7 @@ inline XMVECTOR PackedVector::XMLoadUDecN4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Convert 0-255 to 0.0f-1.0f
@ -1031,7 +1031,7 @@ inline XMVECTOR PackedVector::XMLoadUDec4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Convert 0-255 to 0.0f-1.0f
@ -1074,7 +1074,7 @@ inline XMVECTOR PackedVector::XMLoadDecN4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
// Convert 0-255 to 0.0f-1.0f
@ -1117,7 +1117,7 @@ inline XMVECTOR PackedVector::XMLoadDec4
// a is unsigned! Flip the bit to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// RGB + 0, A + 0x80000000.f to undo the signed order.
vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
// Convert 0-255 to 0.0f-1.0f
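
For the Dec/XDec family the SSE path does the sign handling with XOR and add constants; a scalar equivalent that sign-extends one 10-bit field from a packed 10:10:10:2 value may make the intent clearer. This is illustrative only: the field order (x in the low bits) and an arithmetic right shift of signed values are assumptions, though both hold for MSVC, Clang, and GCC.

#include <cstdint>

// Extract a signed 10-bit field from a 10:10:10:2 packed value.
inline int32_t ExtractDec10(uint32_t packed, unsigned shift /* 0, 10 or 20 */)
{
    uint32_t field = (packed >> shift) & 0x3FFu;      // 10 raw bits
    // Shift the field up to bit 31, then arithmetic-shift back down so the
    // sign bit (bit 9) is replicated across the upper bits.
    return static_cast<int32_t>(field << 22) >> 22;
}
// ExtractDec10(0x3FF, 0) == -1, ExtractDec10(0x1FF, 0) == 511
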
@ -1152,7 +1152,7 @@ inline XMVECTOR PackedVector::XMLoadUByteN4
// w is signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// w + 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Fix y, z and w because they are too large
@ -1187,7 +1187,7 @@ inline XMVECTOR PackedVector::XMLoadUByte4
// w is signed! Flip the bits to convert the order to unsigned
vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// w + 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
// Fix y, z and w because they are too large
@ -1222,7 +1222,7 @@ inline XMVECTOR PackedVector::XMLoadByteN4
// x,y and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x, y and z - 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
// Fix y, z and w because they are too large
@ -1258,7 +1258,7 @@ inline XMVECTOR PackedVector::XMLoadByte4
// x,y and z are unsigned! Flip the bits to convert the order to signed
vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
// Convert to floating point numbers
vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
// x, y and z - 0x80 to complete the conversion
vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
// Fix y, z and w because they are too large
@ -1284,7 +1284,7 @@ inline XMVECTOR PackedVector::XMLoadUNibble4
// Mask off x, y, z and w
vResult = _mm_and_ps(vResult,UNibble4And);
// Convert to float
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Normalize x, y, and z
vResult = _mm_mul_ps(vResult,UNibble4Mul);
return vResult;
@ -1315,7 +1315,7 @@ inline XMVECTOR PackedVector::XMLoadU555
// Mask off x, y, z and w
vResult = _mm_and_ps(vResult,U555And);
// Convert to float
vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
// Normalize x, y, and z
vResult = _mm_mul_ps(vResult,U555Mul);
return vResult;
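
XMLoadU555 masks each field in place and then scales by a reciprocal power of two instead of shifting; a scalar sketch of the same unpack, assuming the usual bit-field layout (x in bits 0-4, w in bit 15):

#include <cstdint>

struct Unpacked555 { float x, y, z, w; };

// Unpack a 16-bit 5:5:5:1 value into integer-valued floats (0..31, w is 0 or 1).
inline Unpacked555 UnpackU555(uint16_t v)
{
    return {
        static_cast<float>( v        & 0x1F),   // x: bits 0-4
        static_cast<float>((v >> 5)  & 0x1F),   // y: bits 5-9
        static_cast<float>((v >> 10) & 0x1F),   // z: bits 10-14
        static_cast<float>((v >> 15) & 0x01)    // w: bit 15
    };
}
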
@ -1369,7 +1369,7 @@ inline void PackedVector::XMStoreColor
// Convert to 0-255
vResult = _mm_mul_ps(vResult,Scale);
// Shuffle RGBA to ARGB
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
// Convert to int
__m128i vInt = _mm_cvtps_epi32(vResult);
// Mash to shorts
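
XMStoreColor scales the 0.0-1.0 inputs to 0-255, reorders RGBA to ARGB, and narrows with saturating packs. A scalar sketch of the same packing (PackColorARGB is illustrative; rounding is simplified to add-0.5-and-truncate):

#include <cstdint>

// Pack four 0.0-1.0 floats into a D3DCOLOR-style 0xAARRGGBB dword.
inline uint32_t PackColorARGB(float r, float g, float b, float a)
{
    auto to8 = [](float f) -> uint32_t {
        float scaled = f * 255.0f + 0.5f;            // scale and round
        if (scaled < 0.0f)   scaled = 0.0f;          // saturate, like the packs step
        if (scaled > 255.0f) scaled = 255.0f;
        return static_cast<uint32_t>(scaled);
    };
    return (to8(a) << 24) | (to8(r) << 16) | (to8(g) << 8) | to8(b);
}
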
@ -1431,7 +1431,7 @@ inline void PackedVector::XMStoreShortN2
vResult = _mm_mul_ps(vResult,Scale);
__m128i vResulti = _mm_cvtps_epi32(vResult);
vResulti = _mm_packs_epi32(vResulti,vResulti);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1469,7 +1469,7 @@ inline void PackedVector::XMStoreShort2
__m128i vInt = _mm_cvtps_epi32(vResult);
// Pack the ints into shorts
vInt = _mm_packs_epi32(vInt,vInt);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vInt));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
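
The 16-bit stores share one tail-end pattern: convert to 32-bit ints, saturate-pack to shorts with _mm_packs_epi32, then write only the low 32 (or 64) bits through a bit cast and _mm_store_ss (or _mm_store_sd). A standalone sketch, with Short2 as a hypothetical stand-in for the packed destination type:

#include <emmintrin.h>
#include <cstdint>

struct Short2 { int16_t x, y; };   // stand-in for the packed destination

// Store the two low lanes of an already-rounded integer vector as shorts.
inline void StoreShort2(Short2* dst, __m128i rounded)
{
    __m128i packed = _mm_packs_epi32(rounded, rounded);   // saturate to int16
    _mm_store_ss(reinterpret_cast<float*>(&dst->x),       // write the low 32 bits
                 _mm_castsi128_ps(packed));
}
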
@ -1942,7 +1942,7 @@ inline void PackedVector::XMStoreShortN4
vResult = _mm_mul_ps(vResult,Scale);
__m128i vResulti = _mm_cvtps_epi32(vResult);
vResulti = _mm_packs_epi32(vResulti,vResulti);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -1991,7 +1991,7 @@ inline void PackedVector::XMStoreShort4
__m128i vInt = _mm_cvtps_epi32(vResult);
// Pack the ints into shorts
vInt = _mm_packs_epi32(vInt,vInt);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
_mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vInt));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2138,13 +2138,13 @@ inline void PackedVector::XMStoreXDecN4
__m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
vResulti = _mm_add_epi32(vResulti,vResultw);
// Do a horizontal or of all 4 entries
vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1));
vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
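
The rewritten XMStoreXDecN4 tail is a horizontal OR: each lane already holds its field shifted into position, and three rotate-and-OR steps fold all four lanes into lane 0. As a standalone sketch (HorizontalOr is illustrative; it returns the dword instead of storing it):

#include <emmintrin.h>
#include <cstdint>

// OR all four 32-bit lanes of v together and return the result from lane 0.
inline uint32_t HorizontalOr(__m128i v)
{
    __m128 r = _mm_castsi128_ps(v);
    // Rotate the lanes by one and OR, three times: lane 0 accumulates x|y|z|w.
    r = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 3, 2, 1));
    v = _mm_or_si128(v, _mm_castps_si128(r));
    r = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 3, 2, 1));
    v = _mm_or_si128(v, _mm_castps_si128(r));
    r = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 3, 2, 1));
    v = _mm_or_si128(v, _mm_castps_si128(r));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(v));
}
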
@ -2197,7 +2197,7 @@ inline void PackedVector::XMStoreXDec4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2248,7 +2248,7 @@ inline void PackedVector::XMStoreUDecN4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2299,7 +2299,7 @@ inline void PackedVector::XMStoreUDec4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2348,7 +2348,7 @@ inline void PackedVector::XMStoreDecN4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2399,7 +2399,7 @@ inline void PackedVector::XMStoreDec4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2451,7 +2451,7 @@ inline void PackedVector::XMStoreUByteN4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2503,7 +2503,7 @@ inline void PackedVector::XMStoreUByte4
vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2553,7 +2553,7 @@ inline void PackedVector::XMStoreByteN4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
@ -2605,7 +2605,7 @@ inline void PackedVector::XMStoreByte4
vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
// i = x|y|z|w
vResulti = _mm_or_si128(vResulti,vResulti2);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
_mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
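
Typical use of these load/store pairs is a round trip through the packed type; a small usage sketch (XMUBYTEN4 chosen arbitrarily, and the restored values are only approximate because of the 8-bit quantization):

#include <DirectXMath.h>
#include <DirectXPackedVector.h>

using namespace DirectX;
using namespace DirectX::PackedVector;

void RoundTripExample()
{
    XMUBYTEN4 packed;
    XMVECTOR color = XMVectorSet(0.25f, 0.5f, 0.75f, 1.0f);
    XMStoreUByteN4(&packed, color);             // quantize to 4 x 8-bit, 0-255
    XMVECTOR restored = XMLoadUByteN4(&packed); // back to floats in [0,1]
    (void)restored;
}
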