diff --git a/Inc/DirectXMathMatrix.inl b/Inc/DirectXMathMatrix.inl index 8d51ba6..54a5ed7 100644 --- a/Inc/DirectXMathMatrix.inl +++ b/Inc/DirectXMathMatrix.inl @@ -297,13 +297,20 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply return mResult; #elif defined(_XM_SSE_INTRINSICS_) XMMATRIX mResult; + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3); +#else // Use vW to hold the original row XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif // Perform the operation on the first row vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); @@ -315,11 +322,18 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply vX = _mm_add_ps(vX,vY); mResult.r[0] = vX; // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3); +#else vW = M1.r[1]; vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); vZ = _mm_mul_ps(vZ,M2.r[2]); @@ -328,11 +342,18 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply vY = _mm_add_ps(vY,vW); vX = _mm_add_ps(vX,vY); mResult.r[1] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0); + vY = 
_mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3); +#else vW = M1.r[2]; vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); vZ = _mm_mul_ps(vZ,M2.r[2]); @@ -341,11 +362,18 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiply vY = _mm_add_ps(vY,vW); vX = _mm_add_ps(vX,vY); mResult.r[2] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3); +#else vW = M1.r[3]; vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); vZ = _mm_mul_ps(vZ,M2.r[2]); @@ -450,13 +478,20 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose mResult.r[3] = T1.val[1]; return mResult; #elif defined(_XM_SSE_INTRINSICS_) + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3); +#else // Use vW to hold the original row XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif // Perform the operation on the first 
row vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); @@ -466,13 +501,20 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose vX = _mm_add_ps(vX,vZ); vY = _mm_add_ps(vY,vW); vX = _mm_add_ps(vX,vY); - __m128 r0 = vX; + XMVECTOR r0 = vX; // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3); +#else vW = M1.r[1]; vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); vZ = _mm_mul_ps(vZ,M2.r[2]); @@ -480,12 +522,19 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose vX = _mm_add_ps(vX,vZ); vY = _mm_add_ps(vY,vW); vX = _mm_add_ps(vX,vY); - __m128 r1 = vX; + XMVECTOR r1 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3); +#else vW = M1.r[2]; vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); vZ = _mm_mul_ps(vZ,M2.r[2]); @@ -493,12 +542,19 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose vX = _mm_add_ps(vX,vZ); vY = _mm_add_ps(vY,vW); vX = _mm_add_ps(vX,vY); - __m128 r2 = vX; + XMVECTOR r2 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2); + vW = 
_mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3); +#else vW = M1.r[3]; vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); +#endif vX = _mm_mul_ps(vX,M2.r[0]); vY = _mm_mul_ps(vY,M2.r[1]); vZ = _mm_mul_ps(vZ,M2.r[2]); @@ -506,7 +562,7 @@ inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose vX = _mm_add_ps(vX,vZ); vY = _mm_add_ps(vY,vW); vX = _mm_add_ps(vX,vY); - __m128 r3 = vX; + XMVECTOR r3 = vX; // x.x,x.y,y.x,y.y XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0));