XMFLOAT3X4 data type and load/store functions (#71)

2024-11-09 14:10:09 +00:00 · 2018-06-01 10:47:08 -07:00 · 2018-06-01 10:47:08 -07:00 · 0fad2114f8
commit 0fad2114f8
parent 9226cd4d0c
3 changed files with 374 additions and 2 deletions
--- a/Inc/DirectXMath.h
+++ b/Inc/DirectXMath.h
@ -748,7 +748,7 @@ struct XMFLOAT3X3
 };

 //------------------------------------------------------------------------------
-// 4x3 Matrix: 32 bit floating point components
+// 4x3 Row-major Matrix: 32 bit floating point components
 struct XMFLOAT4X3
 {
    union
@ -761,6 +761,7 @@ struct XMFLOAT4X3
            float _41, _42, _43;
        };
        float m[4][3];
+        float f[12];
    };

    XMFLOAT4X3() = default;
@ -785,7 +786,7 @@ struct XMFLOAT4X3
    float&      operator() (size_t Row, size_t Column) { return m[Row][Column]; }
 };

-// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary
+// 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary
 __declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3
 {
    XMFLOAT4X3A() = default;
@ -804,6 +805,60 @@ __declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3
    explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {}
 };

+//------------------------------------------------------------------------------
+// 3x4 Column-major Matrix: 32 bit floating point components (3 rows of 4 floats; XMLoadFloat3x4 reads it transposed)
+struct XMFLOAT3X4
+{
+    union   // three overlapping views of the same 12 floats
+    {
+        struct
+        {
+            float _11, _12, _13, _14;   // same storage as m[0][0..3]
+            float _21, _22, _23, _24;   // same storage as m[1][0..3]
+            float _31, _32, _33, _34;   // same storage as m[2][0..3]
+        };
+        float m[3][4];                  // indexed m[Row][Column]
+        float f[12];                    // flat view in the same memory order as m
+    };
+
+    XMFLOAT3X4() = default;
+
+    XMFLOAT3X4(const XMFLOAT3X4&) = default;
+    XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default;
+
+    XMFLOAT3X4(XMFLOAT3X4&&) = default;
+    XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default;
+
+    XM_CONSTEXPR XMFLOAT3X4(float m00, float m01, float m02, float m03,   // mRC initializes the element at zero-based row R, column C
+                            float m10, float m11, float m12, float m13,
+                            float m20, float m21, float m22, float m23)
+        : _11(m00), _12(m01), _13(m02), _14(m03),
+          _21(m10), _22(m11), _23(m12), _24(m13),
+          _31(m20), _32(m21), _33(m22), _34(m23) {}
+    explicit XMFLOAT3X4(_In_reads_(12) const float *pArray);   // copies pArray[0..11] in storage order (defined in DirectXMathMatrix.inl)
+
+    float       operator() (size_t Row, size_t Column) const { return m[Row][Column]; }   // read m[Row][Column]; no bounds check
+    float&      operator() (size_t Row, size_t Column) { return m[Row][Column]; }         // writable reference to m[Row][Column]; no bounds check
+};
+
+// 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary
+__declspec(align(16)) struct XMFLOAT3X4A : public XMFLOAT3X4   // adds the 16-byte alignment request only; declares no new data members
+{
+    XMFLOAT3X4A() = default;
+
+    XMFLOAT3X4A(const XMFLOAT3X4A&) = default;
+    XMFLOAT3X4A& operator=(const XMFLOAT3X4A&) = default;
+
+    XMFLOAT3X4A(XMFLOAT3X4A&&) = default;
+    XMFLOAT3X4A& operator=(XMFLOAT3X4A&&) = default;
+
+    XM_CONSTEXPR XMFLOAT3X4A(float m00, float m01, float m02, float m03,   // mRC is the element at zero-based row R, column C
+                             float m10, float m11, float m12, float m13,
+                             float m20, float m21, float m22, float m23) :
+        XMFLOAT3X4(m00, m01, m02, m03, m10, m11, m12, m13, m20, m21, m22, m23) {}   // forwards all 12 values unchanged to the base constructor
+    explicit XMFLOAT3X4A(_In_reads_(12) const float *pArray) : XMFLOAT3X4(pArray) {}   // forwards to the base array constructor (element-wise copy of 12 floats)
+};
+
 //------------------------------------------------------------------------------
 // 4x4 Matrix: 32 bit floating point components
 struct XMFLOAT4X4
@ -923,6 +978,8 @@ XMVECTOR    XM_CALLCONV     XMLoadUInt4(_In_ const XMUINT4* pSource);
 XMMATRIX    XM_CALLCONV     XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource);
 XMMATRIX    XM_CALLCONV     XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource);
 XMMATRIX    XM_CALLCONV     XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource);
+XMMATRIX    XM_CALLCONV     XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource);      // transposing load into a 4x4 XMMATRIX (defined in DirectXMathConvert.inl)
+XMMATRIX    XM_CALLCONV     XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource);    // same result; additionally asserts pSource is 16-byte aligned
 XMMATRIX    XM_CALLCONV     XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource);
 XMMATRIX    XM_CALLCONV     XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource);

@ -959,6 +1016,8 @@ void        XM_CALLCONV     XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVE
 void        XM_CALLCONV     XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M);
 void        XM_CALLCONV     XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M);
 void        XM_CALLCONV     XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M);
+void        XM_CALLCONV     XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination, _In_ FXMMATRIX M);      // transposing store; the w lane of each M.r[i] is not written
+void        XM_CALLCONV     XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination, _In_ FXMMATRIX M);    // same result; additionally asserts pDestination is 16-byte aligned
 void        XM_CALLCONV     XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M);
 void        XM_CALLCONV     XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M);

--- a/Inc/DirectXMathConvert.inl
+++ b/Inc/DirectXMathConvert.inl
@ -1000,6 +1000,169 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A
 #endif
 }

+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat3x4   // Loads a 3x4 matrix transposed into a 4x4 XMMATRIX: r[c] = (m[0][c], m[1][c], m[2][c], w)
+(
+    const XMFLOAT3X4* pSource   // must be non-null; this overload asserts no alignment
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];   // r[0] gathers source column 0
+    M.r[0].vector4_f32[1] = pSource->m[1][0];
+    M.r[0].vector4_f32[2] = pSource->m[2][0];
+    M.r[0].vector4_f32[3] = 0.0f;               // w of r[0..2] is set to 0, not read from the source
+
+    M.r[1].vector4_f32[0] = pSource->m[0][1];   // r[1] gathers source column 1
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[2][1];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[0][2];   // r[2] gathers source column 2
+    M.r[2].vector4_f32[1] = pSource->m[1][2];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = pSource->m[0][3];   // r[3] gathers source column 3
+    M.r[3].vector4_f32[1] = pSource->m[1][3];
+    M.r[3].vector4_f32[2] = pSource->m[2][3];
+    M.r[3].vector4_f32[3] = 1.0f;               // w of r[3] is set to 1
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);   // de-interleaving load of source rows 0-1: val[i] = { m[0][i], m[1][i] }
+    float32x4_t vTemp1 = vld1q_f32(&pSource->_31);    // source row 2: { _31, _32, _33, _34 }
+
+    float32x2_t l = vget_low_f32(vTemp1);             // { _31, _32 }
+    float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);  // { _11, _21, _31, _32 }
+    float32x2_t rl = vrev64_f32(l);                   // { _32, _31 }
+    float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); // { _12, _22, _32, _31 }
+
+    float32x2_t h = vget_high_f32(vTemp1);            // { _33, _34 }
+    float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);  // { _13, _23, _33, _34 }
+    float32x2_t rh = vrev64_f32(h);                   // { _34, _33 }
+    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); // { _14, _24, _34, _33 }
+
+    XMMATRIX M = {};
+    M.r[0] = vandq_u32(T0, g_XMMask3);                // drop the stray 4th lane; assumes g_XMMask3 keeps x,y,z and zeroes w - TODO confirm
+    M.r[1] = vandq_u32(T1, g_XMMask3);
+    M.r[2] = vandq_u32(T2, g_XMMask3);
+    M.r[3] = vsetq_lane_f32(1.f, T3, 3);              // overwrite the stray 4th lane with 1, as in the scalar path
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = _mm_loadu_ps(&pSource->_11);   // unaligned loads of the three source rows (called x, y, z in the comments below)
+    M.r[1] = _mm_loadu_ps(&pSource->_21);
+    M.r[2] = _mm_loadu_ps(&pSource->_31);
+    M.r[3] = g_XMIdentityR3;                // stand-in fourth row (w below); presumably (0,0,0,1) to match the scalar path - TODO confirm
+
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+    XMMATRIX mResult;   // receives the 4x4 transpose of M
+
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A   // Aligned variant: loads a 3x4 matrix transposed into a 4x4 XMMATRIX, r[c] = (m[0][c], m[1][c], m[2][c], w)
+(
+    const XMFLOAT3X4A* pSource   // must be non-null and 16-byte aligned (both asserted below)
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);   // low four address bits clear => 16-byte aligned
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];   // r[0] gathers source column 0
+    M.r[0].vector4_f32[1] = pSource->m[1][0];
+    M.r[0].vector4_f32[2] = pSource->m[2][0];
+    M.r[0].vector4_f32[3] = 0.0f;               // w of r[0..2] is set to 0, not read from the source
+
+    M.r[1].vector4_f32[0] = pSource->m[0][1];   // r[1] gathers source column 1
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[2][1];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[0][2];   // r[2] gathers source column 2
+    M.r[2].vector4_f32[1] = pSource->m[1][2];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = pSource->m[0][3];   // r[3] gathers source column 3
+    M.r[3].vector4_f32[1] = pSource->m[1][3];
+    M.r[3].vector4_f32[2] = pSource->m[2][3];
+    M.r[3].vector4_f32[3] = 1.0f;               // w of r[3] is set to 1
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128);   // de-interleaving load of source rows 0-1: val[i] = { m[0][i], m[1][i] }; 128 is presumably an alignment hint in bits - TODO confirm
+    float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128);    // source row 2: { _31, _32, _33, _34 }
+
+    float32x2_t l = vget_low_f32(vTemp1);             // { _31, _32 }
+    float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);  // { _11, _21, _31, _32 }
+    float32x2_t rl = vrev64_f32(l);                   // { _32, _31 }
+    float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); // { _12, _22, _32, _31 }
+
+    float32x2_t h = vget_high_f32(vTemp1);            // { _33, _34 }
+    float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);  // { _13, _23, _33, _34 }
+    float32x2_t rh = vrev64_f32(h);                   // { _34, _33 }
+    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); // { _14, _24, _34, _33 }
+
+    XMMATRIX M = {};
+    M.r[0] = vandq_u32(T0, g_XMMask3);                // drop the stray 4th lane; assumes g_XMMask3 keeps x,y,z and zeroes w - TODO confirm
+    M.r[1] = vandq_u32(T1, g_XMMask3);
+    M.r[2] = vandq_u32(T2, g_XMMask3);
+    M.r[3] = vsetq_lane_f32(1.f, T3, 3);              // overwrite the stray 4th lane with 1, as in the scalar path
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = _mm_load_ps(&pSource->_11);   // aligned loads of the three source rows (called x, y, z in the comments below)
+    M.r[1] = _mm_load_ps(&pSource->_21);
+    M.r[2] = _mm_load_ps(&pSource->_31);
+    M.r[3] = g_XMIdentityR3;               // stand-in fourth row (w below); presumably (0,0,0,1) to match the scalar path - TODO confirm
+
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+    XMMATRIX mResult;   // receives the 4x4 transpose of M
+
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
 //------------------------------------------------------------------------------
 _Use_decl_annotations_
 inline XMMATRIX XM_CALLCONV XMLoadFloat4x4
@ -1804,6 +1967,125 @@ inline void XM_CALLCONV XMStoreFloat4x3A
 #endif
 }

+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3x4   // Stores the x,y,z lanes of M transposed: m[r][c] = lane r of M.r[c]; the w lanes are dropped
+(
+    XMFLOAT3X4* pDestination,   // must be non-null; this overload asserts no alignment
+    FXMMATRIX M                 // all four rows are read, but only lanes 0..2 of each
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];   // destination row 0 collects lane 0 (x) of r[0..3]
+    pDestination->m[0][1] = M.r[1].vector4_f32[0];
+    pDestination->m[0][2] = M.r[2].vector4_f32[0];
+    pDestination->m[0][3] = M.r[3].vector4_f32[0];
+
+    pDestination->m[1][0] = M.r[0].vector4_f32[1];   // destination row 1 collects lane 1 (y) of r[0..3]
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[2].vector4_f32[1];
+    pDestination->m[1][3] = M.r[3].vector4_f32[1];
+
+    pDestination->m[2][0] = M.r[0].vector4_f32[2];   // destination row 2 collects lane 2 (z) of r[0..3]
+    pDestination->m[2][1] = M.r[1].vector4_f32[2];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+    pDestination->m[2][3] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);   // val[0] = { r0.x, r2.x, r0.y, r2.y }, val[1] = { r0.z, r2.z, r0.w, r2.w }
+    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);   // val[0] = { r1.x, r3.x, r1.y, r3.y }, val[1] = { r1.z, r3.z, r1.w, r3.w }
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);   // val[0] = the four x lanes, val[1] = the four y lanes
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);   // val[0] = the four z lanes; val[1] (the w lanes) is never stored
+
+    vst1q_f32(&pDestination->m[0][0], T0.val[0]);   // one 4-float store per destination row
+    vst1q_f32(&pDestination->m[1][0], T0.val[1]);
+    vst1q_f32(&pDestination->m[2][0], T1.val[0]);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));   // x, y, z, w in these comments name M.r[0..3]
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+    // x.x,y.x,z.x,w.x
+    XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));   // the fourth transposed row (the w lanes) is not computed
+
+    _mm_storeu_ps(&pDestination->m[0][0], r0);   // unaligned 4-float store per destination row
+    _mm_storeu_ps(&pDestination->m[1][0], r1);
+    _mm_storeu_ps(&pDestination->m[2][0], r2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3x4A   // Aligned variant: stores the x,y,z lanes of M transposed, m[r][c] = lane r of M.r[c]; the w lanes are dropped
+(
+    XMFLOAT3X4A* pDestination,   // must be non-null and 16-byte aligned (both asserted below)
+    FXMMATRIX M                  // all four rows are read, but only lanes 0..2 of each
+)
+{
+    assert(pDestination);
+    assert(((uintptr_t)pDestination & 0xF) == 0);   // low four address bits clear => 16-byte aligned
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];   // destination row 0 collects lane 0 (x) of r[0..3]
+    pDestination->m[0][1] = M.r[1].vector4_f32[0];
+    pDestination->m[0][2] = M.r[2].vector4_f32[0];
+    pDestination->m[0][3] = M.r[3].vector4_f32[0];
+
+    pDestination->m[1][0] = M.r[0].vector4_f32[1];   // destination row 1 collects lane 1 (y) of r[0..3]
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[2].vector4_f32[1];
+    pDestination->m[1][3] = M.r[3].vector4_f32[1];
+
+    pDestination->m[2][0] = M.r[0].vector4_f32[2];   // destination row 2 collects lane 2 (z) of r[0..3]
+    pDestination->m[2][1] = M.r[1].vector4_f32[2];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+    pDestination->m[2][3] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);   // val[0] = { r0.x, r2.x, r0.y, r2.y }, val[1] = { r0.z, r2.z, r0.w, r2.w }
+    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);   // val[0] = { r1.x, r3.x, r1.y, r3.y }, val[1] = { r1.z, r3.z, r1.w, r3.w }
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);   // val[0] = the four x lanes, val[1] = the four y lanes
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);   // val[0] = the four z lanes; val[1] (the w lanes) is never stored
+
+    vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128);   // one 4-float store per destination row; 128 is presumably an alignment hint in bits - TODO confirm
+    vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128);
+    vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));   // x, y, z, w in these comments name M.r[0..3]
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+    // x.x,y.x,z.x,w.x
+    XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));   // the fourth transposed row (the w lanes) is not computed
+
+    _mm_store_ps(&pDestination->m[0][0], r0);   // aligned 4-float store per destination row (each row starts 16 bytes after the previous one)
+    _mm_store_ps(&pDestination->m[1][0], r1);
+    _mm_store_ps(&pDestination->m[2][0], r2);
+#endif
+}
+
 //------------------------------------------------------------------------------
 _Use_decl_annotations_
 inline void XM_CALLCONV XMStoreFloat4x4
--- a/Inc/DirectXMathMatrix.inl
+++ b/Inc/DirectXMathMatrix.inl
@ -3244,6 +3244,37 @@ inline XMFLOAT4X3::XMFLOAT4X3
    m[3][2] = pArray[11];
 }

+/****************************************************************************
+*
+* XMFLOAT3X4 operators
+*
+****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT3X4::XMFLOAT3X4   // Builds the matrix from 12 consecutive floats, copied in storage order with no transposition
+(
+    const float* pArray   // must be non-null; elements pArray[0] through pArray[11] are read
+)
+{
+    assert(pArray != nullptr);
+
+    m[0][0] = pArray[0];   // pArray[0..3] -> row 0
+    m[0][1] = pArray[1];
+    m[0][2] = pArray[2];
+    m[0][3] = pArray[3];
+
+    m[1][0] = pArray[4];   // pArray[4..7] -> row 1
+    m[1][1] = pArray[5];
+    m[1][2] = pArray[6];
+    m[1][3] = pArray[7];
+
+    m[2][0] = pArray[8];   // pArray[8..11] -> row 2
+    m[2][1] = pArray[9];
+    m[2][2] = pArray[10];
+    m[2][3] = pArray[11];
+}
+
 /****************************************************************************
 *
 * XMFLOAT4X4 operators