mirror of
https://github.com/microsoft/DirectXMath
synced 2024-11-21 20:00:12 +00:00
Updated extensions headers for clang
This commit is contained in:
parent
549b51d9cc
commit
24a4887b35
@ -7,25 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error AVX not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1600)
|
||||
#error AVX intrinsics requires Visual C++ 2010 Service Pack 1 or later.
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
|
||||
namespace DirectX
|
||||
@ -41,12 +28,20 @@ inline bool XMVerifyAVXSupport()
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid( CPUInfo, 0 );
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#endif
|
||||
|
||||
// We check for AVX, OSXSAVE, SSSE4.1, and SSE3
|
||||
return ( (CPUInfo[2] & 0x18080001) == 0x18080001 );
|
||||
@ -97,9 +92,9 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_
|
||||
assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
|
||||
_Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
|
||||
|
||||
static const XMVECTORU32 three = { 3, 3, 3, 3 };
|
||||
static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
|
||||
|
||||
_declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
|
||||
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
|
||||
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
|
||||
|
||||
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
|
||||
@ -210,8 +205,8 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
|
||||
}
|
||||
|
||||
// Special-case permute templates
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
|
||||
|
@ -7,25 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error AVX2 not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1700)
|
||||
#error AVX2 intrinsics requires Visual C++ 2012 or later.
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
#include <DirectXPackedVector.h>
|
||||
|
||||
@ -42,18 +29,30 @@ inline bool XMVerifyAVX2Support()
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 7 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// We check for F16C, FMA3, AVX, OSXSAVE, SSSE4.1, and SSE3
|
||||
if ( (CPUInfo[2] & 0x38081001) != 0x38081001 )
|
||||
return false;
|
||||
|
||||
#ifdef __clang__
|
||||
__cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuidex(CPUInfo, 7, 0);
|
||||
#endif
|
||||
|
||||
return ( (CPUInfo[1] & 0x20 ) == 0x20 );
|
||||
}
|
||||
@ -123,9 +122,9 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_
|
||||
assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
|
||||
_Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
|
||||
|
||||
static const XMVECTORU32 three = { 3, 3, 3, 3 };
|
||||
static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
|
||||
|
||||
_declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
|
||||
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
|
||||
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
|
||||
|
||||
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
|
||||
@ -305,7 +304,7 @@ inline XMVECTOR XM_CALLCONV XMVector3Unproject
|
||||
CXMMATRIX World
|
||||
)
|
||||
{
|
||||
static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
|
||||
static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
|
||||
|
||||
XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
|
||||
Scale = XMVectorReciprocal(Scale);
|
||||
@ -550,8 +549,8 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
|
||||
}
|
||||
|
||||
// Special-case permute templates
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
|
||||
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
|
||||
@ -622,7 +621,7 @@ template<uint32_t Elements>
|
||||
|
||||
inline float XMConvertHalfToFloat( PackedVector::HALF Value )
|
||||
{
|
||||
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
|
||||
__m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
|
||||
__m128 V2 = _mm_cvtph_ps( V1 );
|
||||
return _mm_cvtss_f32( V2 );
|
||||
}
|
||||
@ -647,29 +646,33 @@ inline float* XMConvertHalfToFloatStream
|
||||
|
||||
assert(pOutputStream);
|
||||
assert(pInputStream);
|
||||
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
assert(InputStride >= sizeof(HALF));
|
||||
assert(OutputStride >= sizeof(float));
|
||||
|
||||
auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = HalfCount >> 2;
|
||||
if ( four > 0 )
|
||||
if (four > 0)
|
||||
{
|
||||
if (InputStride == sizeof(HALF))
|
||||
{
|
||||
if (OutputStride == sizeof(float))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Packed input, aligned & packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -678,13 +681,13 @@ inline float* XMConvertHalfToFloatStream
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -694,26 +697,26 @@ inline float* XMConvertHalfToFloatStream
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
|
||||
pFloat += OutputStride;
|
||||
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (OutputStride == sizeof(float))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Scattered input, aligned & packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
@ -728,14 +731,14 @@ inline float* XMConvertHalfToFloatStream
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16( HV, H1, 0 );
|
||||
HV = _mm_insert_epi16( HV, H2, 1 );
|
||||
HV = _mm_insert_epi16( HV, H3, 2 );
|
||||
HV = _mm_insert_epi16( HV, H4, 3 );
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
HV = _mm_insert_epi16(HV, H1, 0);
|
||||
HV = _mm_insert_epi16(HV, H2, 1);
|
||||
HV = _mm_insert_epi16(HV, H3, 2);
|
||||
HV = _mm_insert_epi16(HV, H4, 3);
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_stream_ps( reinterpret_cast<float*>(pFloat ), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -754,16 +757,49 @@ inline float* XMConvertHalfToFloatStream
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16( HV, H1, 0 );
|
||||
HV = _mm_insert_epi16( HV, H2, 1 );
|
||||
HV = _mm_insert_epi16( HV, H3, 2 );
|
||||
HV = _mm_insert_epi16( HV, H4, 3 );
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
HV = _mm_insert_epi16(HV, H1, 0);
|
||||
HV = _mm_insert_epi16(HV, H2, 1);
|
||||
HV = _mm_insert_epi16(HV, H3, 2);
|
||||
HV = _mm_insert_epi16(HV, H4, 3);
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16(HV, H1, 0);
|
||||
HV = _mm_insert_epi16(HV, H2, 1);
|
||||
HV = _mm_insert_epi16(HV, H3, 2);
|
||||
HV = _mm_insert_epi16(HV, H4, 3);
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -792,8 +828,12 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
|
||||
assert(pOutputStream);
|
||||
assert(pInputStream);
|
||||
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
assert(InputStride >= sizeof(float));
|
||||
assert(OutputStride >= sizeof(HALF));
|
||||
|
||||
auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = FloatCount >> 2;
|
||||
@ -803,18 +843,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
{
|
||||
if (OutputStride == sizeof(HALF))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Aligned and packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
|
||||
pHalf += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -823,36 +863,36 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
|
||||
pHalf += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Aligned & packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
@ -862,18 +902,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
@ -885,26 +925,60 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
// Scattered input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
|
||||
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
|
||||
FV = _mm_blend_ps( FV, FT, 0xC );
|
||||
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
|
||||
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
|
||||
FV = _mm_blend_ps(FV, FT, 0xC);
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
|
||||
pHalf += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
|
||||
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
|
||||
FV = _mm_blend_ps(FV, FT, 0xC);
|
||||
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
|
@ -7,16 +7,9 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#ifndef _M_ARM
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
@ -31,15 +24,15 @@ inline XMVECTOR XM_CALLCONV XMVectorEndian
|
||||
)
|
||||
{
|
||||
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
|
||||
static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
|
||||
static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } };
|
||||
|
||||
int8x8x2_t tbl;
|
||||
tbl.val[0] = vget_low_f32(V);
|
||||
tbl.val[1] = vget_high_f32(V);
|
||||
uint8x8x2_t tbl;
|
||||
tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
|
||||
tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
|
||||
|
||||
const __n64 rL = vtbl2_u8( tbl, vget_low_f32(idx) );
|
||||
const __n64 rH = vtbl2_u8( tbl, vget_high_f32(idx) );
|
||||
return vcombine_f32( rL, rH );
|
||||
const uint8x8_t rL = vtbl2_u8(tbl, vget_low_u32(idx));
|
||||
const uint8x8_t rH = vtbl2_u8(tbl, vget_high_u32(idx));
|
||||
return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
|
||||
#else
|
||||
XMVECTORU32 E;
|
||||
E.v = V;
|
||||
@ -56,7 +49,7 @@ inline XMVECTOR XM_CALLCONV XMVectorEndian
|
||||
}
|
||||
|
||||
|
||||
#ifndef _M_ARM
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
|
||||
namespace SSSE3
|
||||
{
|
||||
|
||||
@ -65,13 +58,21 @@ inline bool XMVerifySSSE3Support()
|
||||
// Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
int CPUInfo[4] = { -1 };
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// Check for SSSE3 instruction set.
|
||||
return ( (CPUInfo[2] & 0x200) != 0 );
|
||||
@ -82,13 +83,13 @@ inline XMVECTOR XM_CALLCONV XMVectorEndian
|
||||
FXMVECTOR V
|
||||
)
|
||||
{
|
||||
static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
|
||||
static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } };
|
||||
|
||||
__m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx );
|
||||
return _mm_castsi128_ps( Result );
|
||||
}
|
||||
|
||||
} // namespace SSSE3
|
||||
#endif // !_M_ARM
|
||||
#endif // X86 || X64
|
||||
|
||||
} // namespace DirectX;
|
||||
} // namespace DirectX
|
||||
|
@ -7,25 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error F16C not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1700)
|
||||
#error F16C/CVT16 intrinsics requires Visual C++ 2012 or later.
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
#include <DirectXPackedVector.h>
|
||||
|
||||
@ -41,13 +28,21 @@ inline bool XMVerifyF16CSupport()
|
||||
// with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
int CPUInfo[4] = { -1 };
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// We check for F16C, AVX, OSXSAVE, and SSE4.1
|
||||
return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 );
|
||||
@ -60,7 +55,7 @@ inline bool XMVerifyF16CSupport()
|
||||
|
||||
inline float XMConvertHalfToFloat( PackedVector::HALF Value )
|
||||
{
|
||||
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
|
||||
__m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
|
||||
__m128 V2 = _mm_cvtph_ps( V1 );
|
||||
return _mm_cvtss_f32( V2 );
|
||||
}
|
||||
@ -74,10 +69,10 @@ inline PackedVector::HALF XMConvertFloatToHalf( float Value )
|
||||
|
||||
inline float* XMConvertHalfToFloatStream
|
||||
(
|
||||
_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream,
|
||||
_In_ size_t OutputStride,
|
||||
_In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream,
|
||||
_In_ size_t InputStride,
|
||||
_Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream,
|
||||
_In_ size_t OutputStride,
|
||||
_In_reads_bytes_(2 + InputStride * (HalfCount - 1)) const PackedVector::HALF* pInputStream,
|
||||
_In_ size_t InputStride,
|
||||
_In_ size_t HalfCount
|
||||
)
|
||||
{
|
||||
@ -85,29 +80,33 @@ inline float* XMConvertHalfToFloatStream
|
||||
|
||||
assert(pOutputStream);
|
||||
assert(pInputStream);
|
||||
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
assert(InputStride >= sizeof(HALF));
|
||||
assert(OutputStride >= sizeof(float));
|
||||
|
||||
auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = HalfCount >> 2;
|
||||
if ( four > 0 )
|
||||
if (four > 0)
|
||||
{
|
||||
if (InputStride == sizeof(HALF))
|
||||
{
|
||||
if (OutputStride == sizeof(float))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Packed input, aligned & packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -116,13 +115,13 @@ inline float* XMConvertHalfToFloatStream
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -132,26 +131,26 @@ inline float* XMConvertHalfToFloatStream
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
|
||||
pHalf += InputStride*4;
|
||||
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
|
||||
pHalf += InputStride * 4;
|
||||
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
|
||||
pFloat += OutputStride;
|
||||
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (OutputStride == sizeof(float))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Scattered input, aligned & packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
@ -166,14 +165,14 @@ inline float* XMConvertHalfToFloatStream
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16( HV, H1, 0 );
|
||||
HV = _mm_insert_epi16( HV, H2, 1 );
|
||||
HV = _mm_insert_epi16( HV, H3, 2 );
|
||||
HV = _mm_insert_epi16( HV, H4, 3 );
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
HV = _mm_insert_epi16(HV, H1, 0);
|
||||
HV = _mm_insert_epi16(HV, H2, 1);
|
||||
HV = _mm_insert_epi16(HV, H3, 2);
|
||||
HV = _mm_insert_epi16(HV, H4, 3);
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_stream_ps( reinterpret_cast<float*>(pFloat ), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -192,16 +191,49 @@ inline float* XMConvertHalfToFloatStream
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16( HV, H1, 0 );
|
||||
HV = _mm_insert_epi16( HV, H2, 1 );
|
||||
HV = _mm_insert_epi16( HV, H3, 2 );
|
||||
HV = _mm_insert_epi16( HV, H4, 3 );
|
||||
__m128 FV = _mm_cvtph_ps( HV );
|
||||
HV = _mm_insert_epi16(HV, H1, 0);
|
||||
HV = _mm_insert_epi16(HV, H2, 1);
|
||||
HV = _mm_insert_epi16(HV, H3, 2);
|
||||
HV = _mm_insert_epi16(HV, H4, 3);
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
|
||||
pFloat += OutputStride*4;
|
||||
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
|
||||
pHalf += InputStride;
|
||||
|
||||
__m128i HV = _mm_setzero_si128();
|
||||
HV = _mm_insert_epi16(HV, H1, 0);
|
||||
HV = _mm_insert_epi16(HV, H2, 1);
|
||||
HV = _mm_insert_epi16(HV, H3, 2);
|
||||
HV = _mm_insert_epi16(HV, H4, 3);
|
||||
__m128 FV = _mm_cvtph_ps(HV);
|
||||
|
||||
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
|
||||
pFloat += OutputStride;
|
||||
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
|
||||
pFloat += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -210,7 +242,7 @@ inline float* XMConvertHalfToFloatStream
|
||||
{
|
||||
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
|
||||
pHalf += InputStride;
|
||||
pFloat += OutputStride;
|
||||
pFloat += OutputStride;
|
||||
}
|
||||
|
||||
return pOutputStream;
|
||||
@ -219,10 +251,10 @@ inline float* XMConvertHalfToFloatStream
|
||||
|
||||
inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
(
|
||||
_Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream,
|
||||
_In_ size_t OutputStride,
|
||||
_In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream,
|
||||
_In_ size_t InputStride,
|
||||
_Out_writes_bytes_(2 + OutputStride * (FloatCount - 1)) PackedVector::HALF* pOutputStream,
|
||||
_In_ size_t OutputStride,
|
||||
_In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream,
|
||||
_In_ size_t InputStride,
|
||||
_In_ size_t FloatCount
|
||||
)
|
||||
{
|
||||
@ -230,8 +262,12 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
|
||||
assert(pOutputStream);
|
||||
assert(pInputStream);
|
||||
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
assert(InputStride >= sizeof(float));
|
||||
assert(OutputStride >= sizeof(HALF));
|
||||
|
||||
auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
|
||||
auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
|
||||
|
||||
size_t i = 0;
|
||||
size_t four = FloatCount >> 2;
|
||||
@ -241,18 +277,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
{
|
||||
if (OutputStride == sizeof(HALF))
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Aligned and packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
|
||||
pHalf += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -261,36 +297,36 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
// Packed input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
|
||||
pHalf += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( ((uintptr_t)pFloat & 0xF) == 0)
|
||||
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
|
||||
{
|
||||
// Aligned & packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
@ -300,18 +336,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
// Packed input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
|
||||
pFloat += InputStride*4;
|
||||
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride * 4;
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
@ -323,26 +359,60 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
// Scattered input, packed output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
|
||||
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
|
||||
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
|
||||
FV = _mm_blend_ps( FV, FT, 0xC );
|
||||
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
|
||||
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
|
||||
FV = _mm_blend_ps(FV, FT, 0xC);
|
||||
|
||||
__m128i HV = _mm_cvtps_ph( FV, 0 );
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
|
||||
pHalf += OutputStride*4;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
|
||||
pHalf += OutputStride * 4;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Scattered input, scattered output
|
||||
for (size_t j = 0; j < four; ++j)
|
||||
{
|
||||
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
|
||||
pFloat += InputStride;
|
||||
|
||||
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
|
||||
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
|
||||
FV = _mm_blend_ps(FV, FT, 0xC);
|
||||
|
||||
__m128i HV = _mm_cvtps_ph(FV, 0);
|
||||
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
|
||||
pHalf += OutputStride;
|
||||
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
|
||||
pHalf += OutputStride;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
@ -351,7 +421,7 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
|
||||
for (; i < FloatCount; ++i)
|
||||
{
|
||||
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
|
||||
pFloat += InputStride;
|
||||
pFloat += InputStride;
|
||||
pHalf += OutputStride;
|
||||
}
|
||||
|
||||
@ -398,4 +468,4 @@ inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination,
|
||||
|
||||
} // namespace F16C
|
||||
|
||||
} // namespace DirectX;
|
||||
} // namespace DirectX
|
||||
|
@ -7,25 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error FMA3 not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1700)
|
||||
#error FMA3 intrinsics requires Visual C++ 2012 or later.
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
|
||||
namespace DirectX
|
||||
@ -41,12 +28,20 @@ inline bool XMVerifyFMA3Support()
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// We check for FMA3, AVX, OSXSAVE
|
||||
return ( (CPUInfo[2] & 0x18001000) == 0x18001000 );
|
||||
@ -221,7 +216,7 @@ inline XMVECTOR XM_CALLCONV XMVector3Unproject
|
||||
CXMMATRIX World
|
||||
)
|
||||
{
|
||||
static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
|
||||
static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
|
||||
|
||||
XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
|
||||
Scale = XMVectorReciprocal(Scale);
|
||||
|
@ -7,23 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error FMA4 not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1600)
|
||||
#error FMA4 intrinsics requires Visual C++ 2010 Service Pack 1 or later.
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <ammintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
@ -41,20 +30,32 @@ inline bool XMVerifyFMA4Support()
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// We check for AVX, OSXSAVE (required to access FMA4)
|
||||
if ( (CPUInfo[2] & 0x18000000) != 0x18000000 )
|
||||
return false;
|
||||
|
||||
__cpuid( CPUInfo, 0x80000000 );
|
||||
#ifdef __clang__
|
||||
__cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0x80000000);
|
||||
#endif
|
||||
|
||||
if ( CPUInfo[0] < 0x80000001 )
|
||||
if ( uint32_t(CPUInfo[0]) < 0x80000001u )
|
||||
return false;
|
||||
|
||||
// We check for FMA4
|
||||
@ -230,7 +231,7 @@ inline XMVECTOR XM_CALLCONV XMVector3Unproject
|
||||
CXMMATRIX World
|
||||
)
|
||||
{
|
||||
static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
|
||||
static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
|
||||
|
||||
XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
|
||||
Scale = XMVectorReciprocal(Scale);
|
||||
|
@ -7,19 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error SSE3 not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <pmmintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
@ -35,13 +28,20 @@ inline bool XMVerifySSE3Support()
|
||||
// Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
|
||||
int CPUInfo[4] = { -1 };
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// We only check for SSE3 instruction set. SSSE3 instructions are not used.
|
||||
return ( (CPUInfo[2] & 0x1) != 0 );
|
||||
@ -108,4 +108,4 @@ inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V )
|
||||
|
||||
} // namespace SSE3
|
||||
|
||||
} // namespace DirectX;
|
||||
} // namespace DirectX
|
||||
|
@ -7,19 +7,12 @@
|
||||
// http://go.microsoft.com/fwlink/?LinkID=615560
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#ifdef _M_ARM
|
||||
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
|
||||
#error SSE4 not supported on ARM platform
|
||||
#endif
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4987)
|
||||
#include <intrin.h>
|
||||
#pragma warning(pop)
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include <DirectXMath.h>
|
||||
@ -35,13 +28,20 @@ inline bool XMVerifySSE4Support()
|
||||
// Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors
|
||||
|
||||
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
int CPUInfo[4] = {-1};
|
||||
__cpuid( CPUInfo, 0 );
|
||||
|
||||
int CPUInfo[4] = { -1 };
|
||||
#ifdef __clang__
|
||||
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 0);
|
||||
#endif
|
||||
if ( CPUInfo[0] < 1 )
|
||||
return false;
|
||||
|
||||
__cpuid(CPUInfo, 1 );
|
||||
#ifdef __clang__
|
||||
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
|
||||
#else
|
||||
__cpuid(CPUInfo, 1);
|
||||
#endif
|
||||
|
||||
// We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
|
||||
return ( (CPUInfo[2] & 0x80000) == 0x80000 );
|
||||
@ -52,22 +52,26 @@ inline bool XMVerifySSE4Support()
|
||||
// Vector
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
|
||||
#endif
|
||||
|
||||
inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
|
||||
{
|
||||
assert( y != nullptr );
|
||||
*((int*)y) = _mm_extract_ps( V, 1 );
|
||||
*reinterpret_cast<int*>(y) = _mm_extract_ps( V, 1 );
|
||||
}
|
||||
|
||||
inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
|
||||
{
|
||||
assert( z != nullptr );
|
||||
*((int*)z) = _mm_extract_ps( V, 2 );
|
||||
*reinterpret_cast<int*>(z) = _mm_extract_ps( V, 2 );
|
||||
}
|
||||
|
||||
inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
|
||||
{
|
||||
assert( w != nullptr );
|
||||
*((int*)w) = _mm_extract_ps( V, 3 );
|
||||
*reinterpret_cast<int*>(w) = _mm_extract_ps( V, 3 );
|
||||
}
|
||||
|
||||
inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
|
||||
@ -410,4 +414,4 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P )
|
||||
|
||||
} // namespace SSE4
|
||||
|
||||
} // namespace DirectX;
|
||||
} // namespace DirectX
|
||||
|
Loading…
Reference in New Issue
Block a user