mirror of https://github.com/microsoft/DirectXMath
synced 2024-11-08 13:40:09 +00:00

Updated for clang control defines

parent 9fa46af4ec
commit 7fcdfbc3c6
@@ -82,9 +82,9 @@
 #endif

 #if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-#if (defined(_M_IX86) || defined(_M_X64)) && !defined(_M_HYBRID_X86_ARM64)
+#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
 #define _XM_SSE_INTRINSICS_
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
 #define _XM_ARM_NEON_INTRINSICS_
 #elif !defined(_XM_NO_INTRINSICS_)
 #error DirectX Math does not support this target
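Note: this hunk works because clang and GCC predefine `__i386__`, `__x86_64__`, `__arm__`, and `__aarch64__` for their targets, whereas MSVC only defines the `_M_*` family; OR-ing both families makes the auto-detection fire on either compiler. A minimal sketch to confirm which path a given toolchain selects (the pragma messages are illustrative, not part of the library):

    // Sketch: report the intrinsics family DirectXMath picks on this toolchain.
    #include <DirectXMath.h>
    #if defined(_XM_SSE_INTRINSICS_)
    #pragma message("DirectXMath: SSE intrinsics path")
    #elif defined(_XM_ARM_NEON_INTRINSICS_)
    #pragma message("DirectXMath: ARM NEON intrinsics path")
    #else
    #pragma message("DirectXMath: no-intrinsics path")
    #endif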
@@ -104,11 +104,14 @@
 #pragma warning(pop)

 #ifndef _XM_NO_INTRINSICS_

+#ifdef _MSC_VER
 #pragma warning(push)
 #pragma warning(disable : 4987)
 // C4987: Off by default noise
 #include <intrin.h>
 #pragma warning(pop)
+#endif
+
 #if defined(__clang__) && (__x86_64__ || __i386__)
 #include <cpuid.h>
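Note: `<intrin.h>` and the `#pragma warning` controls around it are MSVC-only, hence the new `_MSC_VER` guard; clang targeting x86/x64 gets its CPUID interface from `<cpuid.h>` instead. A hedged sketch of the two query styles (the helper name is illustrative; x86/x64 only):

    #include <cstdint>
    #ifdef _MSC_VER
    #include <intrin.h>
    #else
    #include <cpuid.h>
    #endif

    // Sketch: read the ECX feature bits of CPUID leaf 1 on either compiler.
    inline uint32_t GetCpuidLeaf1Ecx()
    {
    #ifdef _MSC_VER
        int cpuInfo[4] = {};
        __cpuid(cpuInfo, 1);
        return static_cast<uint32_t>(cpuInfo[2]);
    #else
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
        __get_cpuid(1, &eax, &ebx, &ecx, &edx);
        return ecx;
    #endif
    }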
@@ -131,7 +134,7 @@
 #endif

 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64))
 #include <arm64_neon.h>
 #else
 #include <arm_neon.h>
@@ -172,6 +175,18 @@

 #endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_

+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+#if defined(__clang__)
+#define XM_PREFETCH( a ) __builtin_prefetch(a)
+#elif defined(_MSC_VER)
+#define XM_PREFETCH( a ) __prefetch(a)
+#else
+#define XM_PREFETCH( a )
+#endif
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
 namespace DirectX
 {

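Note: the new `XM_PREFETCH` macro replaces the raw `__prefetch` calls in the NEON stream transforms below: clang spells the hint `__builtin_prefetch`, MSVC's ARM compilers spell it `__prefetch`, and on anything else the macro expands to nothing so the call sites compile away. A usage sketch in the style of those stream loops (assumes a NEON build where the macro is defined; function and parameter names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // Sketch: walk a packed vector stream, hinting one cache line ahead.
    inline void TouchStream(const uint8_t* pInput, size_t count, size_t stride)
    {
        for (size_t i = 0; i < count; ++i)
        {
            XM_PREFETCH( pInput + XM_CACHE_LINE_SIZE ); // no-op on unknown compilers
            // ... load and process the element at pInput ...
            pInput += stride;
        }
    }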
@@ -311,21 +326,21 @@ typedef __vector4 XMVECTOR;
 #endif

 // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise
-#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR FXMVECTOR;
 #else
 typedef const XMVECTOR& FXMVECTOR;
 #endif

 // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and x64 vector call; by reference otherwise
-#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR GXMVECTOR;
 #else
 typedef const XMVECTOR& GXMVECTOR;
 #endif

 // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise
-#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMVECTOR HXMVECTOR;
 #else
 typedef const XMVECTOR& HXMVECTOR;
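Note: these typedefs encode how many XMVECTOR arguments the calling convention can pass in SIMD registers; adding the clang-style macros lets the in-register forms apply on non-MSVC x86, ARM, and ARM64 builds too. Per the fix-up comments, a function takes FXMVECTOR for its first three vector parameters, GXMVECTOR for the fourth, and HXMVECTOR for the fifth and sixth. A sketch (the function itself is illustrative):

    #include <DirectXMath.h>
    using namespace DirectX;

    // Sketch: parameter-type convention for a four-vector helper.
    XMVECTOR XM_CALLCONV BlendFour(FXMVECTOR A, FXMVECTOR B, FXMVECTOR C, GXMVECTOR D)
    {
        return XMVectorMultiply(XMVectorAdd(A, B), XMVectorAdd(C, D));
    }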
@@ -428,7 +443,7 @@ XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S);
 struct XMMATRIX;

 // Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise
-#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
+#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
 typedef const XMMATRIX FXMMATRIX;
 #else
 typedef const XMMATRIX& FXMMATRIX;
@@ -3061,7 +3061,7 @@ inline XMMATRIX& XMMATRIX::operator/= (float S)
     r[3] = XMVectorDivide( r[3], vS );
     return *this;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     float32x4_t vS = vdupq_n_f32( S );
     r[0] = vdivq_f32( r[0], vS );
     r[1] = vdivq_f32( r[1], vS );
@@ -3148,7 +3148,7 @@ inline XMMATRIX XMMATRIX::operator/ (float S) const
     R.r[3] = XMVectorDivide( r[3], vS );
     return R;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     float32x4_t vS = vdupq_n_f32( S );
     XMMATRIX R;
     R.r[0] = vdivq_f32( r[0], vS );
@@ -2339,7 +2339,7 @@ inline XMVECTOR XM_CALLCONV XMVectorRound
     return Result.v;

 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndnq_f32(V);
 #else
     uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
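Note: `vrndnq_f32` (round to nearest, ties to even) is an ARMv8/A64 instruction, which is why the guard widens to `__aarch64__`; the ARMv7 `#else` branch reconstructs the rounding manually, starting from the sign mask shown above. A sketch of the classic 2^23 trick such fallbacks rely on (assumes |V| < 2^23 and the default round-to-nearest FPU mode; helper name illustrative):

    #include <arm_neon.h>

    // Sketch: round-to-nearest without vrndnq_f32 (ARMv7 NEON).
    // Adding and subtracting 2^23, signed like the input, forces the FPU
    // to drop the fraction bits, rounding to nearest-even on the way.
    inline float32x4_t RoundNearestSketch(float32x4_t V)
    {
        uint32x4_t sign  = vandq_u32(vreinterpretq_u32_f32(V), vdupq_n_u32(0x80000000u));
        uint32x4_t magic = vorrq_u32(vreinterpretq_u32_f32(vdupq_n_f32(8388608.0f)), sign);
        float32x4_t r = vaddq_f32(V, vreinterpretq_f32_u32(magic));
        return vsubq_f32(r, vreinterpretq_f32_u32(magic));
    }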
@@ -2403,7 +2403,7 @@ inline XMVECTOR XM_CALLCONV XMVectorTruncate
     return Result;

 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndq_f32(V);
 #else
     float32x4_t vTest = vabsq_f32( V );
@@ -2453,7 +2453,7 @@ inline XMVECTOR XM_CALLCONV XMVectorFloor
     } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndmq_f32(V);
 #else
     float32x4_t vTest = vabsq_f32( V );
@@ -2507,7 +2507,7 @@ inline XMVECTOR XM_CALLCONV XMVectorCeiling
     } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vrndpq_f32(V);
 #else
     float32x4_t vTest = vabsq_f32( V );
@@ -2810,7 +2810,7 @@ inline XMVECTOR XM_CALLCONV XMVectorSum
     return Result.v;

 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     XMVECTOR vTemp = vpaddq_f32(V, V);
     return vpaddq_f32(vTemp,vTemp);
 #else
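Note: `vpaddq_f32` is also A64-only; it adds adjacent lane pairs, so applying it twice leaves the full horizontal sum in every lane. A sketch with the ARMv7 fallback spelled out (helper name illustrative):

    #include <arm_neon.h>

    // Sketch: horizontal sum of all four lanes, broadcast to every lane.
    inline float32x4_t HorizontalSumSketch(float32x4_t V)
    {
    #if defined(__aarch64__)
        float32x4_t t = vpaddq_f32(V, V);   // {x+y, z+w, x+y, z+w}
        return vpaddq_f32(t, t);            // {x+y+z+w, ...} in all lanes
    #else
        float32x2_t s = vadd_f32(vget_low_f32(V), vget_high_f32(V)); // {x+z, y+w}
        s = vpadd_f32(s, s);                                         // {sum, sum}
        return vcombine_f32(s, s);
    #endif
    }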
@@ -3015,7 +3015,7 @@ inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
     } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vfmaq_f32( V3, V1, V2 );
 #else
     return vmlaq_f32( V3, V1, V2 );
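Note: the two branches are not bit-identical: `vfmaq_f32` is a fused multiply-add (one rounding), while the ARMv7 `vmlaq_f32` rounds the product before the add. A scalar sketch of the per-lane difference (illustrative helpers):

    #include <cmath>

    // Sketch: per-lane meaning of the two intrinsics.
    inline float FusedLane(float v1, float v2, float v3)   { return std::fma(v1, v2, v3); } // like vfmaq_f32
    inline float UnfusedLane(float v1, float v2, float v3) { return v3 + v1 * v2; }         // like vmlaq_f32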
@@ -3045,7 +3045,7 @@ inline XMVECTOR XM_CALLCONV XMVectorDivide
     } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vdivq_f32( V1, V2 );
 #else
     // 2 iterations of Newton-Raphson refinement of reciprocal
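Note: `vdivq_f32` likewise exists only on A64; the Newton-Raphson comment introduces the ARMv7 path, which refines a hardware reciprocal estimate instead of dividing. A sketch of that refinement (two steps, as the comment says; helper name illustrative):

    #include <arm_neon.h>

    // Sketch: V1 / V2 via reciprocal estimate plus two Newton-Raphson steps.
    inline float32x4_t DivideSketch(float32x4_t V1, float32x4_t V2)
    {
        float32x4_t recip = vrecpeq_f32(V2);              // ~8-bit estimate of 1/V2
        recip = vmulq_f32(vrecpsq_f32(V2, recip), recip); // step 1: recip *= (2 - V2*recip)
        recip = vmulq_f32(vrecpsq_f32(V2, recip), recip); // step 2
        return vmulq_f32(V1, recip);
    }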
@@ -3079,7 +3079,7 @@ inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
     } } };
     return Result;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     return vfmsq_f32( V3, V1, V2 );
 #else
     return vmlsq_f32( V3, V1, V2 );
@@ -3154,7 +3154,7 @@ inline XMVECTOR XM_CALLCONV XMVectorReciprocal
     } } };
     return Result.v;
 #elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     float32x4_t one = vdupq_n_f32(1.0f);
     return vdivq_f32(one,V);
 #else
@@ -7755,26 +7755,26 @@ inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
     XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
     XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r3 = vget_high_f32( row3 );
     r = vget_high_f32( row0 );
     XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
     XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( row1 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
     vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     float32x4x4_t R;
     R.val[0] = vResult0;
@@ -8060,26 +8060,26 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
     XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
     XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r3 = vget_high_f32( row3 );
     r = vget_high_f32( row0 );
     XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( row1 );
     W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     V.val[0] = vdivq_f32( vResult0, W );
     V.val[1] = vdivq_f32( vResult1, W );
 #else
@@ -8113,7 +8113,7 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
     V = vget_high_f32( vResult );
     float32x2_t W = vdup_lane_f32( V, 1 );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     V = vget_low_f32( vResult );
     V = vdiv_f32( V, W );
 #else
@@ -8454,15 +8454,15 @@ inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
     XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
     XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx

-    __prefetch( pInputVector );
-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     V.val[0] = vResult0;
     V.val[1] = vResult1;
@@ -10224,38 +10224,38 @@ inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream
     XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
     XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r3 = vget_high_f32( row3 );
     r = vget_high_f32( row0 );
     XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
     XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( row1 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
     vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     r = vget_low_f32( row2 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );

     r = vget_high_f32( row2 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
     vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );

     float32x4x4_t R;
     R.val[0] = vResult0;
@@ -10611,40 +10611,40 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
     XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
     XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r3 = vget_high_f32( row3 );
     r = vget_high_f32( row0 );
     XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
     XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( row1 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
     W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     r = vget_low_f32( row2 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );

     r = vget_high_f32( row2 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
     W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     V.val[0] = vdivq_f32( vResult0, W );
     V.val[1] = vdivq_f32( vResult1, W );
     V.val[2] = vdivq_f32( vResult2, W );
@@ -10683,7 +10683,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
     VH = vget_high_f32(vResult);
     XMVECTOR W = vdupq_lane_f32( VH, 1 );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     vResult = vdivq_f32( vResult, W );
 #else
     // 2 iterations of Newton-Raphson refinement of reciprocal for W
@@ -11134,34 +11134,34 @@ inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream
     XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
     XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r = vget_high_f32( row0 );
     XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( row1 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     r = vget_low_f32( row2 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
     vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );

     r = vget_high_f32( row2 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );

     V.val[0] = vResult0;
     V.val[1] = vResult1;
@@ -11576,40 +11576,40 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
     XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
     XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r3 = vget_high_f32( Transform.r[3] );
     r = vget_high_f32( Transform.r[0] );
     XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
     XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( Transform.r[1] );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( Transform.r[1] );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
     W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     r = vget_low_f32( Transform.r[2] );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
     vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );

     r = vget_high_f32( Transform.r[2] );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
     W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     vResult0 = vdivq_f32( vResult0, W );
     vResult1 = vdivq_f32( vResult1, W );
     vResult2 = vdivq_f32( vResult2, W );
@@ -11657,7 +11657,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
     VH = vget_high_f32(vResult);
     XMVECTOR W = vdupq_lane_f32( VH, 1 );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     vResult = vdivq_f32( vResult, W );
 #else
     // 2 iterations of Newton-Raphson refinement of reciprocal for W
@@ -12165,14 +12165,14 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
     XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M
     XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r3 = vget_high_f32( Transform.r[3] );
     r = vget_high_f32( Transform.r[0] );
     XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O
     XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     XMVECTOR ScaleY = vdupq_n_f32(sy);
     XMVECTOR OffsetY = vdupq_n_f32(oy);
@@ -12182,13 +12182,13 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
     vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M
     vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( Transform.r[1] );
     vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O
     W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     XMVECTOR ScaleZ = vdupq_n_f32(sz);
     XMVECTOR OffsetZ = vdupq_n_f32(oz);
@@ -12198,15 +12198,15 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
     vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M
     vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );

     r = vget_high_f32( Transform.r[2] );
     vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O
     W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     V.val[0] = vdivq_f32( vResult0, W );
     V.val[1] = vdivq_f32( vResult1, W );
     V.val[2] = vdivq_f32( vResult2, W );
@@ -12260,7 +12260,7 @@ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
     VH = vget_high_f32(vResult);
     XMVECTOR W = vdupq_lane_f32( VH, 1 );

-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
     vResult = vdivq_f32( vResult, W );
 #else
     // 2 iterations of Newton-Raphson refinement of reciprocal for W
@@ -14256,49 +14256,49 @@ inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
     XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
     XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx

-    __prefetch( pInputVector );
+    XM_PREFETCH( pInputVector );

     r = vget_high_f32( row0 );
     XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
     XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx

-    __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+    XM_PREFETCH( pInputVector+XM_CACHE_LINE_SIZE );

     r = vget_low_f32( row1 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
     vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*2) );

     r = vget_high_f32( row1 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
     vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*3) );

     r = vget_low_f32( row2 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
     vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*4) );

     r = vget_high_f32( row2 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
     vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*5) );

     r = vget_low_f32( row3 );
     vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw
     vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*6) );

     r = vget_high_f32( row3 );
     vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow
     vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw

-    __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) );
+    XM_PREFETCH( pInputVector+(XM_CACHE_LINE_SIZE*7) );

     V.val[0] = vResult0;
     V.val[1] = vResult1;
@@ -26,7 +26,7 @@ inline float XMConvertHalfToFloat
     __m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
     __m128 V2 = _mm_cvtph_ps( V1 );
     return _mm_cvtss_f32( V2 );
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     uint16x4_t vHalf = vdup_n_u16(Value);
     float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
     return vgetq_lane_f32(vFloat, 0);
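Note: the SSE branch above requires F16C (`_mm_cvtph_ps`), and the NEON branch requires the half-precision conversions this library assumes on ARM64, hence the `__aarch64__` addition in these guards too. A round-trip usage sketch of the NEON path (helper name illustrative):

    #include <arm_neon.h>
    #include <cstdint>

    // Sketch: HALF -> float -> HALF round trip with the NEON conversions.
    inline uint16_t RoundTripHalfSketch(uint16_t half)
    {
        uint16x4_t  vHalf  = vdup_n_u16(half);
        float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); // widen
        float16x4_t vBack  = vcvt_f16_f32(vFloat);                      // narrow
        return vget_lane_u16(vreinterpret_u16_f16(vBack), 0);
    }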
@@ -258,7 +258,7 @@ inline float* XMConvertHalfToFloatStream
     XM_SFENCE();

     return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
     auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);

@@ -395,7 +395,7 @@ inline HALF XMConvertFloatToHalf
     __m128 V1 = _mm_set_ss( Value );
     __m128i V2 = _mm_cvtps_ph( V1, 0 );
     return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     float32x4_t vFloat = vdupq_n_f32(Value);
     float16x4_t vHalf = vcvt_f16_f32(vFloat);
     return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
@@ -624,7 +624,7 @@ inline HALF* XMConvertFloatToHalfStream
     }

     return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
     auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
