mirror of https://github.com/microsoft/DirectXMath synced 2024-11-22 04:10:06 +00:00

XMVectorSum function for horizontal add

Chuck Walbourn 2016-06-20 17:14:45 -07:00
parent 7de910c260
commit bea739e4e5
2 changed files with 41 additions and 14 deletions


@@ -1007,6 +1007,7 @@ XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V);
XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V);
XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2);
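
The declaration added here, XMVectorSum, performs a horizontal add: it sums the four lanes of the input and, as the implementation below shows, returns that sum replicated across every component of the result. A minimal usage sketch, assuming the DirectXMath headers are on the include path (the values and variable names are illustrative, not part of this commit):

    #include <DirectXMath.h>
    #include <cstdio>
    using namespace DirectX;

    int main()
    {
        XMVECTOR v = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
        XMVECTOR sum = XMVectorSum(v);          // every lane holds 1+2+3+4 = 10
        printf("%f\n", XMVectorGetX(sum));      // prints 10.000000
        return 0;
    }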


@@ -2793,6 +2793,39 @@ inline XMVECTOR XM_CALLCONV XMVectorAdd
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVectorSum
(
FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
XMVECTOR Result;
Result.vector4_f32[0] =
Result.vector4_f32[1] =
Result.vector4_f32[2] =
Result.vector4_f32[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3];
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float32x2_t v1 = vget_low_f32(V);
float32x2_t v2 = vget_high_f32(V);
v1 = vadd_f32(v1, v2);
v1 = vpadd_f32(v1, v1);
return vcombine_f32(v1, v1);
#elif defined(_XM_SSE3_INTRINSICS_)
XMVECTOR vTemp = _mm_hadd_ps(V, V);
return _mm_hadd_ps(vTemp,vTemp);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
XMVECTOR vTemp2 = _mm_add_ps(V, vTemp);
vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2));
return _mm_add_ps(vTemp, vTemp2);
#endif
}
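
A note on the two x86 paths above: with SSE3, _mm_hadd_ps(V, V) produces (x+y, z+w, x+y, z+w), so the second hadd leaves x+y+z+w in every lane. Without SSE3, the permute/add pair performs the same butterfly reduction. A standalone sketch of that fallback with raw intrinsics (the helper name is illustrative, not part of the library):

    #include <xmmintrin.h>

    __m128 HorizontalSumSSE(__m128 v)                               // lanes (x, y, z, w)
    {
        __m128 t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));   // (y, x, w, z)
        __m128 s = _mm_add_ps(v, t);                                // (x+y, x+y, z+w, z+w)
        t = _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 0, 3, 2));          // (z+w, z+w, x+y, x+y)
        return _mm_add_ps(s, t);                                    // x+y+z+w in every lane
    }
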
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVectorAddAngles
(
FXMVECTOR V1,
@@ -12893,9 +12926,8 @@ inline XMVECTOR XM_CALLCONV XMVector4Dot
float32x4_t vTemp = vmulq_f32( V1, V2 );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0xff );
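
The rest of the commit applies the same reduction to the ARM NEON paths of the XMVector4 dot/length/normalize family: instead of pairwise-adding each 64-bit half and then adding the halves (three instructions), the new sequence adds the low and high halves first and finishes with a single pairwise add (two instructions), which is also the pattern XMVectorSum uses above. A standalone sketch of the revised reduction (illustrative helper name, not part of the library):

    #include <arm_neon.h>

    float32x4_t HorizontalSumNeon(float32x4_t v)      // lanes (x, y, z, w)
    {
        float32x2_t lo = vget_low_f32(v);             // (x, y)
        float32x2_t hi = vget_high_f32(v);            // (z, w)
        float32x2_t s  = vadd_f32(lo, hi);            // (x+z, y+w)
        s = vpadd_f32(s, s);                          // (x+y+z+w, x+y+z+w)
        return vcombine_f32(s, s);                    // sum splatted across all four lanes
    }
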
@@ -13090,9 +13122,8 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst
float32x4_t vTemp = vmulq_f32( V, V );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
// Reciprocal sqrt (estimate)
v2 = vrsqrte_f32( v1 );
return vcombine_f32(v2, v2);
@@ -13141,9 +13172,8 @@ inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength
float32x4_t vTemp = vmulq_f32( V, V );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
// Reciprocal sqrt
float32x2_t S0 = vrsqrte_f32(v1);
float32x2_t P0 = vmul_f32( v1, S0 );
@@ -13201,9 +13231,8 @@ inline XMVECTOR XM_CALLCONV XMVector4LengthEst
float32x4_t vTemp = vmulq_f32( V, V );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
const float32x2_t zero = vdup_n_f32(0);
uint32x2_t VEqualsZero = vceq_f32( v1, zero );
// Sqrt (estimate)
@@ -13256,9 +13285,8 @@ inline XMVECTOR XM_CALLCONV XMVector4Length
float32x4_t vTemp = vmulq_f32( V, V );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
const float32x2_t zero = vdup_n_f32(0);
uint32x2_t VEqualsZero = vceq_f32( v1, zero );
// Sqrt
@@ -13317,9 +13345,8 @@ inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst
float32x4_t vTemp = vmulq_f32( V, V );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
// Reciprocal sqrt (estimate)
v2 = vrsqrte_f32( v1 );
// Normalize
@@ -13381,9 +13408,8 @@ inline XMVECTOR XM_CALLCONV XMVector4Normalize
float32x4_t vTemp = vmulq_f32( V, V );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vpadd_f32( v2, v2 );
v1 = vadd_f32( v1, v2 );
v1 = vpadd_f32( v1, v1 );
uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) );
uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
// Reciprocal sqrt (2 iterations of Newton-Raphson)
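
After the reduction, these functions refine the initial vrsqrte_f32 reciprocal-square-root estimate with Newton-Raphson steps built from vrsqrts_f32, which computes (3 - a*b)/2. A minimal sketch of that two-iteration idiom (the helper name is illustrative; the library's exact sequence continues past the lines shown above):

    #include <arm_neon.h>

    float32x2_t RecipSqrtTwoNR(float32x2_t lengthSq)
    {
        float32x2_t e = vrsqrte_f32(lengthSq);                   // initial estimate of 1/sqrt(lengthSq)
        e = vmul_f32(e, vrsqrts_f32(vmul_f32(lengthSq, e), e));  // first Newton-Raphson refinement
        e = vmul_f32(e, vrsqrts_f32(vmul_f32(lengthSq, e), e));  // second refinement
        return e;
    }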