From 7fcdfbc3c64d5c695b39fb376c1f5fb52e1084db Mon Sep 17 00:00:00 2001
From: Chuck Walbourn
Date: Tue, 25 Jun 2019 09:16:03 -0700
Subject: [PATCH] Updated for clang control defines

---
 Inc/DirectXMath.h           |  29 ++++++--
 Inc/DirectXMathMatrix.inl   |   4 +-
 Inc/DirectXMathVector.inl   | 134 ++++++++++++++++++------------------
 Inc/DirectXPackedVector.inl |   8 +--
 4 files changed, 95 insertions(+), 80 deletions(-)

diff --git a/Inc/DirectXMath.h b/Inc/DirectXMath.h
index 148cf61..3b80d87 100644
--- a/Inc/DirectXMath.h
+++ b/Inc/DirectXMath.h
@@ -82,9 +82,9 @@
 #endif
 
 #if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-#if (defined(_M_IX86) || defined(_M_X64)) && !defined(_M_HYBRID_X86_ARM64)
+#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
 #define _XM_SSE_INTRINSICS_
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
 #define _XM_ARM_NEON_INTRINSICS_
 #elif !defined(_XM_NO_INTRINSICS_)
 #error DirectX Math does not support this target
@@ -104,11 +104,14 @@
 #pragma warning(pop)
 
 #ifndef _XM_NO_INTRINSICS_
+
+#ifdef _MSC_VER
 #pragma warning(push)
 #pragma warning(disable : 4987)
 // C4987: Off by default noise
 #include <intrin.h>
 #pragma warning(pop)
+#endif
 
 #if defined(__clang__) && (__x86_64__ || __i386__)
 #include <cpuid.h>
@@ -131,7 +134,7 @@
 #endif
 
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64))
 #include <arm64_neon.h>
 #else
 #include <arm_neon.h>
@@ -172,6 +175,18 @@
 
 #endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
 
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+#if defined(__clang__)
+#define XM_PREFETCH( a ) __builtin_prefetch(a)
+#elif defined(_MSC_VER)
+#define XM_PREFETCH( a ) __prefetch(a)
+#else
+#define XM_PREFETCH( a )
+#endif
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
 namespace DirectX
 {
 
@@ -311,21 +326,21 @@ typedef __vector4 XMVECTOR;
 #endif
 
 // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise
-#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR FXMVECTOR;
 #else
 typedef const XMVECTOR& FXMVECTOR;
 #endif
 
 // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and x64 vector call; by reference otherwise
-#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR GXMVECTOR;
 #else
 typedef const XMVECTOR& GXMVECTOR;
 #endif
 
 // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise
-#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR HXMVECTOR;
 #else
 typedef const XMVECTOR& HXMVECTOR;
@@ -428,7 +443,7 @@ XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S);
 struct XMMATRIX;
 
 // Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise
-#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMMATRIX FXMMATRIX;
 #else
 typedef const XMMATRIX& FXMMATRIX;
diff --git a/Inc/DirectXMathMatrix.inl b/Inc/DirectXMathMatrix.inl
index 56e175e..6a3380e 100644
--- a/Inc/DirectXMathMatrix.inl
+++ b/Inc/DirectXMathMatrix.inl
@@ -3061,7 +3061,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
     r[3] = XMVectorDivide( r[3], vS );
     return *this;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     float32x4_t vS = vdupq_n_f32( S );
     r[0] = vdivq_f32( r[0], vS );
     r[1] = vdivq_f32( r[1], vS );
@@ -3148,7 +3148,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
     R.r[3] = XMVectorDivide( r[3], vS );
     return R;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     float32x4_t vS = vdupq_n_f32( S );
     XMMATRIX R;
     R.r[0] = vdivq_f32( r[0], vS );
diff --git a/Inc/DirectXMathVector.inl b/Inc/DirectXMathVector.inl
index e44e769..ab0801f 100644
--- a/Inc/DirectXMathVector.inl
+++ b/Inc/DirectXMathVector.inl
@@ -2339,7 +2339,7 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
     return Result.v;
 
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndnq_f32(V);
 #else
     uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
@@ -2403,7 +2403,7 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
     return Result;
 
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndq_f32(V);
 #else
     float32x4_t vTest = vabsq_f32( V );
@@ -2453,7 +2453,7 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
         } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndmq_f32(V);
 #else
     float32x4_t vTest = vabsq_f32( V );
@@ -2507,7 +2507,7 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
         } } };
    return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndpq_f32(V);
 #else
     float32x4_t vTest = vabsq_f32( V );
@@ -2810,7 +2810,7 @@ inline XMVECTOR XM_CALLCONV XMVectorSum
     return Result.v;
 
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     XMVECTOR vTemp = vpaddq_f32(V, V);
     return vpaddq_f32(vTemp,vTemp);
 #else
@@ -3015,7 +3015,7 @@ inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
         } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vfmaq_f32( V3, V1, V2 );
 #else
     return vmlaq_f32( V3, V1, V2 );
@@ -3045,7 +3045,7 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
         } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vdivq_f32( V1, V2 );
 #else
     // 2 iterations of Newton-Raphson refinement of reciprocal
@@ -3079,7 +3079,7 @@ inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
         } } };
     return Result;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vfmsq_f32( V3, V1, V2 );
 #else
     return vmlsq_f32( V3, V1, V2 );
@@ -3154,7 +3154,7 @@ inline XMVECTOR XM_CALLCONV XMVectorReciprocal
         } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     float32x4_t one = vdupq_n_f32(1.0f);
     return vdivq_f32(one,V);
 #else
@@ -7755,26 +7755,26 @@ inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
             XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
             XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r3 = vget_high_f32( row3 );
             r = vget_high_f32( row0 );
             XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
             XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( row1 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
             vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             float32x4x4_t R;
             R.val[0] = vResult0;
@@ -8060,26 +8060,26 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
             XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
             XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r3 = vget_high_f32( row3 );
             r = vget_high_f32( row0 );
             XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( row1 );
             W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
             V.val[0] = vdivq_f32( vResult0, W );
             V.val[1] = vdivq_f32( vResult1, W );
 #else
@@ -8113,7 +8113,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
         V = vget_high_f32( vResult );
         float32x2_t W = vdup_lane_f32( V, 1 );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
         V = vget_low_f32( vResult );
         V = vdiv_f32( V, W );
 #else
@@ -8454,15 +8454,15 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
             XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
             XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
 
-            __prefetch( pInputVector );
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             V.val[0] = vResult0;
             V.val[1] = vResult1;
@@ -10224,38 +10224,38 @@ inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream
             XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
             XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r3 = vget_high_f32( row3 );
             r = vget_high_f32( row0 );
             XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
             XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( row1 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
             vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             r = vget_low_f32( row2 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );
 
             r = vget_high_f32( row2 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
             vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );
 
             float32x4x4_t R;
             R.val[0] = vResult0;
@@ -10611,40 +10611,40 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
             XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
             XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r3 = vget_high_f32( row3 );
             r = vget_high_f32( row0 );
             XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
             XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( row1 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
             W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             r = vget_low_f32( row2 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );
 
             r = vget_high_f32( row2 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
             W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
             V.val[0] = vdivq_f32( vResult0, W );
             V.val[1] = vdivq_f32( vResult1, W );
             V.val[2] = vdivq_f32( vResult2, W );
@@ -10683,7 +10683,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
         VH = vget_high_f32(vResult);
         XMVECTOR W = vdupq_lane_f32( VH, 1 );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
         vResult = vdivq_f32( vResult, W );
 #else
         // 2 iterations of Newton-Raphson refinement of reciprocal for W
@@ -11134,34 +11134,34 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream
             XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
             XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r = vget_high_f32( row0 );
             XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( row1 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             r = vget_low_f32( row2 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
             vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );
 
             r = vget_high_f32( row2 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );
 
             V.val[0] = vResult0;
             V.val[1] = vResult1;
@@ -11576,40 +11576,40 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
             XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
             XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r3 = vget_high_f32( Transform.r[3] );
             r = vget_high_f32( Transform.r[0] );
             XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
             XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( Transform.r[1] );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( Transform.r[1] );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
             W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             r = vget_low_f32( Transform.r[2] );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
             vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );
 
             r = vget_high_f32( Transform.r[2] );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
             W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
             vResult0 = vdivq_f32( vResult0, W );
             vResult1 = vdivq_f32( vResult1, W );
             vResult2 = vdivq_f32( vResult2, W );
@@ -11657,7 +11657,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
         VH = vget_high_f32(vResult);
         XMVECTOR W = vdupq_lane_f32( VH, 1 );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
         vResult = vdivq_f32( vResult, W );
 #else
         // 2 iterations of Newton-Raphson refinement of reciprocal for W
@@ -12165,14 +12165,14 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
             XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M
             XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r3 = vget_high_f32( Transform.r[3] );
             r = vget_high_f32( Transform.r[0] );
             XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O
             XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             XMVECTOR ScaleY = vdupq_n_f32(sy);
             XMVECTOR OffsetY = vdupq_n_f32(oy);
@@ -12182,13 +12182,13 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
             vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M
             vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( Transform.r[1] );
             vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O
             W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             XMVECTOR ScaleZ = vdupq_n_f32(sz);
             XMVECTOR OffsetZ = vdupq_n_f32(oz);
@@ -12198,15 +12198,15 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
             vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M
             vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );
 
             r = vget_high_f32( Transform.r[2] );
             vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O
             W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
             V.val[0] = vdivq_f32( vResult0, W );
             V.val[1] = vdivq_f32( vResult1, W );
             V.val[2] = vdivq_f32( vResult2, W );
@@ -12260,7 +12260,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
         VH = vget_high_f32(vResult);
         XMVECTOR W = vdupq_lane_f32( VH, 1 );
 
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
         vResult = vdivq_f32( vResult, W );
 #else
         // 2 iterations of Newton-Raphson refinement of reciprocal for W
@@ -14256,49 +14256,49 @@ inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
             XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
             XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
 
-            __prefetch( pInputVector );
+            XM_PREFETCH( pInputVector );
 
             r = vget_high_f32( row0 );
             XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
             XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx
 
-            __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+            XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );
 
             r = vget_low_f32( row1 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
             vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
 
             r = vget_high_f32( row1 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
             vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );
 
             r = vget_low_f32( row2 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
             vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );
 
             r = vget_high_f32( row2 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
             vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );
 
             r = vget_low_f32( row3 );
             vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw
             vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*6) );
 
             r = vget_high_f32( row3 );
             vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow
             vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw
 
-            __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) );
+            XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*7) );
 
             V.val[0] = vResult0;
             V.val[1] = vResult1;
diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl
index a07bd46..a419316 100644
--- a/Inc/DirectXPackedVector.inl
+++ b/Inc/DirectXPackedVector.inl
@@ -26,7 +26,7 @@ inline float XMConvertHalfToFloat
     __m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
     __m128 V2 = _mm_cvtph_ps( V1 );
     return _mm_cvtss_f32( V2 );
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     uint16x4_t vHalf = vdup_n_u16(Value);
     float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
     return vgetq_lane_f32(vFloat, 0);
@@ -258,7 +258,7 @@ inline float* XMConvertHalfToFloatStream
     XM_SFENCE();
 
     return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
     auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
@@ -395,7 +395,7 @@ inline HALF XMConvertFloatToHalf
     __m128 V1 = _mm_set_ss( Value );
     __m128i V2 = _mm_cvtps_ph( V1, 0 );
     return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     float32x4_t vFloat = vdupq_n_f32(Value);
     float16x4_t vHalf = vcvt_f16_f32(vFloat);
     return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
@@ -624,7 +624,7 @@ inline HALF* XMConvertFloatToHalfStream
     }
 
     return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
     auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
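
Editor's note: the following is a minimal standalone sketch, not part of the patch or of DirectXMath itself. It mirrors the two pieces of compiler detection the change introduces: the architecture test that now also accepts the GCC/clang-style __i386__/__x86_64__/__arm__/__aarch64__ macros alongside MSVC's _M_* macros, and an XM_PREFETCH-style wrapper that maps to __builtin_prefetch under clang and __prefetch under MSVC. The DEMO_PATH and DEMO_PREFETCH names are hypothetical, and the extra _M_ARM/_M_ARM64 guard only keeps the sketch compilable on MSVC x86/x64, where __prefetch does not exist; the patch avoids that case by defining XM_PREFETCH inside the NEON-only block.

// Standalone sketch only -- not part of DirectXMath.
#ifdef _MSC_VER
#include <intrin.h>     // declares __prefetch for ARM/ARM64 targets
#endif
#include <cstdio>

// Same shape as the updated test in DirectXMath.h: accept either the MSVC
// _M_* architecture macros or the GCC/clang-style predefined macros.
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
#define DEMO_PATH "_XM_SSE_INTRINSICS_"
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#define DEMO_PATH "_XM_ARM_NEON_INTRINSICS_"
#else
#define DEMO_PATH "_XM_NO_INTRINSICS_"
#endif

// Same idea as the new XM_PREFETCH macro: clang gets __builtin_prefetch,
// MSVC targeting ARM gets __prefetch, and everything else compiles away.
#if defined(__clang__)
#define DEMO_PREFETCH(a) __builtin_prefetch(a)
#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define DEMO_PREFETCH(a) __prefetch(a)
#else
#define DEMO_PREFETCH(a) ((void)(a))   // evaluates the address, emits no prefetch
#endif

int main()
{
    static int block[64] = {};
    DEMO_PREFETCH(block);   // cache hint only; a no-op where unsupported
    std::printf("This build would select %s\n", DEMO_PATH);
    return 0;
}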