mirror of https://github.com/microsoft/DirectXMath synced 2024-11-09 14:10:09 +00:00

Updated extensions headers for clang

Chuck Walbourn 2019-08-02 14:33:58 -07:00
parent 549b51d9cc
commit 24a4887b35
8 changed files with 450 additions and 310 deletions

Extensions/DirectXMathAVX.h

@@ -7,25 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error AVX not supported on ARM platform
#endif
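The old guard only tested MSVC's _M_ARM; the new condition also covers _M_ARM64, the x86-on-ARM64 hybrid target, and clang/GCC's __arm__ and __aarch64__ predefines (bare identifiers evaluate to 0 in #if when undefined, so the mixed style is safe). As a hedged sketch, the same test could be folded into a single macro; XM_EXT_ON_ARM is a made-up name, not part of the header:

#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
    defined(__arm__) || defined(__aarch64__)
#define XM_EXT_ON_ARM 1 /* any ARM target, either compiler family */
#else
#define XM_EXT_ON_ARM 0
#endif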
#if defined(_MSC_VER) && (_MSC_VER < 1600)
#error AVX intrinsics requires Visual C++ 2010 Service Pack 1 or later.
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <immintrin.h>
#include <DirectXMath.h>
namespace DirectX
@@ -41,12 +28,20 @@ inline bool XMVerifyAVXSupport()
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid( CPUInfo, 0 );
#endif
if ( CPUInfo[0] < 1 )
return false;
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1 );
#endif
// We check for AVX, OSXSAVE, SSE4.1, and SSE3
return ( (CPUInfo[2] & 0x18080001) == 0x18080001 );
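The #ifdef __clang__ pairs exist because clang does not provide MSVC's __cpuid(int[4], int) intrinsic from <intrin.h>; its <cpuid.h> defines a macro of the form __cpuid(leaf, a, b, c, d) that writes each register into a separate lvalue. A minimal sketch of a wrapper hiding the difference (xm_cpuid is a hypothetical name, not part of this header):

#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
inline void xm_cpuid(int info[4], int leaf)
{
    // GCC/clang macro form: __cpuid(level, eax, ebx, ecx, edx)
    __cpuid(leaf, info[0], info[1], info[2], info[3]);
}
#else
#include <intrin.h>
inline void xm_cpuid(int info[4], int leaf)
{
    // MSVC intrinsic form: __cpuid(int cpuInfo[4], int function_id)
    __cpuid(info, leaf);
}
#endif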
@@ -97,9 +92,9 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_
assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
_Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
static const XMVECTORU32 three = { 3, 3, 3, 3 };
static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
_declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
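The extra braces are not cosmetic: XMVECTORU32 (and XMVECTORF32 used later) is a struct whose first member is an anonymous union containing the array, so a fully braced initializer needs three levels, and clang warns about the brace-elided form under -Wmissing-braces. A reduced model of the layout (VecU32 is illustrative, not the real type):

struct VecU32                      // reduced model of XMVECTORU32
{
    union
    {
        unsigned int u[4];         // the real type also aliases an XMVECTOR here
    };
};
VecU32 a = { 3, 3, 3, 3 };         // brace elision: legal C++, but clang warns
VecU32 b = { { { 3, 3, 3, 3 } } }; // explicit: struct -> union -> array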
@@ -210,8 +205,8 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
}
// Special-case permute templates
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
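The old (V2); statements were MSVC-style unused-parameter suppressions, but clang flags such no-effect expression statements under -Wunused-value; leaving the parameter unnamed, as the new specializations do, is silent on both compilers. A minimal illustration of the pattern:

// Accept-and-ignore a parameter without warnings on either compiler:
inline int First(int a, int /*b*/) { return a; }
// The old idiom, which clang warns about under -Wunused-value:
//   inline int First(int a, int b) { (b); return a; }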

Extensions/DirectXMathAVX2.h

@@ -7,25 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error AVX2 not supported on ARM platform
#endif
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#error AVX2 intrinsics requires Visual C++ 2012 or later.
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <immintrin.h>
#include <DirectXMath.h>
#include <DirectXPackedVector.h>
@@ -42,18 +29,30 @@ inline bool XMVerifyAVX2Support()
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 7 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We check for F16C, FMA3, AVX, OSXSAVE, SSE4.1, and SSE3
if ( (CPUInfo[2] & 0x38081001) != 0x38081001 )
return false;
#ifdef __clang__
__cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuidex(CPUInfo, 7, 0);
#endif
return ( (CPUInfo[1] & 0x20 ) == 0x20 );
}
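The magic masks bundle individual feature bits from ECX of CPUID leaf 1, and the final test reads the AVX2 bit from EBX of leaf 7 (hence __cpuid_count/__cpuidex, which also set the subleaf in ECX). Spelled out for the 0x38081001 check above, with illustrative constant names and bit positions per Intel's CPUID documentation:

constexpr int kSSE3    = 1 << 0;   // leaf 1 ECX, 0x00000001
constexpr int kFMA3    = 1 << 12;  // leaf 1 ECX, 0x00001000
constexpr int kSSE41   = 1 << 19;  // leaf 1 ECX, 0x00080000
constexpr int kOSXSAVE = 1 << 27;  // leaf 1 ECX, 0x08000000
constexpr int kAVX     = 1 << 28;  // leaf 1 ECX, 0x10000000
constexpr int kF16C    = 1 << 29;  // leaf 1 ECX, 0x20000000
static_assert((kSSE3 | kFMA3 | kSSE41 | kOSXSAVE | kAVX | kF16C) == 0x38081001,
              "the leaf-1 mask used above, decomposed");
constexpr int kAVX2    = 1 << 5;   // leaf 7 EBX, the 0x20 tested last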
@@ -123,9 +122,9 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_
assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
_Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
static const XMVECTORU32 three = { 3, 3, 3, 3 };
static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
_declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
__m128i vSelect = _mm_cmpgt_epi32( vControl, three );
@@ -305,7 +304,7 @@ inline XMVECTOR XM_CALLCONV XMVector3Unproject
CXMMATRIX World
)
{
static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
Scale = XMVectorReciprocal(Scale);
@@ -550,8 +549,8 @@ template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t Permu
}
// Special-case permute templates
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
@@ -622,7 +621,7 @@ template<uint32_t Elements>
inline float XMConvertHalfToFloat( PackedVector::HALF Value )
{
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
__m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
__m128 V2 = _mm_cvtph_ps( V1 );
return _mm_cvtss_f32( V2 );
}
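The cast changed because _mm_cvtsi32_si128 is declared as taking a signed int, so widening the 16-bit HALF directly to int matches the parameter exactly rather than detouring through uint32_t and an implicit unsigned-to-signed conversion. A self-contained sketch of the same conversion (F16C hardware assumed; HalfBitsToFloat is an illustrative name):

#include <immintrin.h>
#include <cstdint>

inline float HalfBitsToFloat(uint16_t h)
{
    __m128i v = _mm_cvtsi32_si128(static_cast<int>(h)); // int parameter: exact match
    return _mm_cvtss_f32(_mm_cvtph_ps(v));              // F16C half -> float, lane 0
}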
@@ -647,29 +646,33 @@ inline float* XMConvertHalfToFloatStream
assert(pOutputStream);
assert(pInputStream);
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
assert(InputStride >= sizeof(HALF));
assert(OutputStride >= sizeof(float));
auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = HalfCount >> 2;
if ( four > 0 )
if (four > 0)
{
if (InputStride == sizeof(HALF))
{
if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Packed input, aligned & packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
pHalf += InputStride * 4;
__m128 FV = _mm_cvtph_ps( HV );
__m128 FV = _mm_cvtph_ps(HV);
_mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
@@ -678,13 +681,13 @@ inline float* XMConvertHalfToFloatStream
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
pHalf += InputStride * 4;
__m128 FV = _mm_cvtph_ps( HV );
__m128 FV = _mm_cvtph_ps(HV);
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
@@ -694,26 +697,26 @@ inline float* XMConvertHalfToFloatStream
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
pHalf += InputStride * 4;
__m128 FV = _mm_cvtph_ps( HV );
__m128 FV = _mm_cvtph_ps(HV);
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
pFloat += OutputStride;
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
pFloat += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Scattered input, aligned & packed output
for (size_t j = 0; j < four; ++j)
@@ -728,14 +731,14 @@ inline float* XMConvertHalfToFloatStream
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
HV = _mm_insert_epi16(HV, H1, 0);
HV = _mm_insert_epi16(HV, H2, 1);
HV = _mm_insert_epi16(HV, H3, 2);
HV = _mm_insert_epi16(HV, H4, 3);
__m128 FV = _mm_cvtph_ps(HV);
_mm_stream_ps( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
@@ -754,16 +757,49 @@ inline float* XMConvertHalfToFloatStream
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
HV = _mm_insert_epi16(HV, H1, 0);
HV = _mm_insert_epi16(HV, H2, 1);
HV = _mm_insert_epi16(HV, H3, 2);
HV = _mm_insert_epi16(HV, H4, 3);
__m128 FV = _mm_cvtph_ps(HV);
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16(HV, H1, 0);
HV = _mm_insert_epi16(HV, H2, 1);
HV = _mm_insert_epi16(HV, H3, 2);
HV = _mm_insert_epi16(HV, H4, 3);
__m128 FV = _mm_cvtph_ps(HV);
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
pFloat += OutputStride;
i += 4;
}
}
}
@@ -792,8 +828,12 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
assert(pOutputStream);
assert(pInputStream);
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
assert(InputStride >= sizeof(float));
assert(OutputStride >= sizeof(HALF));
auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = FloatCount >> 2;
@@ -803,18 +843,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
{
if (OutputStride == sizeof(HALF))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Aligned and packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
pHalf += OutputStride * 4;
i += 4;
}
}
@@ -823,36 +863,36 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
pHalf += OutputStride * 4;
i += 4;
}
}
}
else
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Aligned & packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
pHalf += OutputStride;
i += 4;
}
@@ -862,18 +902,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
pHalf += OutputStride;
i += 4;
}
@@ -885,26 +925,60 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
FV = _mm_blend_ps( FV, FT, 0xC );
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
FV = _mm_blend_ps(FV, FT, 0xC);
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
pHalf += OutputStride * 4;
i += 4;
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
FV = _mm_blend_ps(FV, FT, 0xC);
__m128i HV = _mm_cvtps_ph(FV, 0);
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
pHalf += OutputStride;
i += 4;
}
}
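Note the new asserts at the top of both stream functions: strides are byte counts and must be at least the element size. A usage sketch under that contract (VertexIn and ExpandU are illustrative, and the functions are assumed to live in this header's AVX2 namespace):

#include "DirectXMathAVX2.h"

struct VertexIn { DirectX::PackedVector::HALF u, v; uint32_t color; };

void ExpandU(float* out, const VertexIn* verts, size_t count)
{
    // Scattered input (one HALF per vertex), packed float output.
    DirectX::AVX2::XMConvertHalfToFloatStream(
        out, sizeof(float),
        &verts[0].u, sizeof(VertexIn),
        count);
}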

Extensions/DirectXMathBE.h

@@ -7,16 +7,9 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#ifndef _M_ARM
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
#include <tmmintrin.h>
#endif
@@ -31,15 +24,15 @@ inline XMVECTOR XM_CALLCONV XMVectorEndian
)
{
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } };
int8x8x2_t tbl;
tbl.val[0] = vget_low_f32(V);
tbl.val[1] = vget_high_f32(V);
uint8x8x2_t tbl;
tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
const __n64 rL = vtbl2_u8( tbl, vget_low_f32(idx) );
const __n64 rH = vtbl2_u8( tbl, vget_high_f32(idx) );
return vcombine_f32( rL, rH );
const uint8x8_t rL = vtbl2_u8(tbl, vget_low_u32(idx));
const uint8x8_t rH = vtbl2_u8(tbl, vget_high_u32(idx));
return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
#else
XMVECTORU32 E;
E.v = V;
@@ -56,7 +49,7 @@ inline XMVECTOR XM_CALLCONV XMVectorEndian
}
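The NEON rewrite matters because clang's arm_neon.h is strictly typed: vtbl2_u8 takes a uint8x8x2_t and returns a uint8x8_t, so float lanes must round-trip through vreinterpret, whereas MSVC's loose __n64 type let the old code compile. For a full byte swap of each 32-bit lane, which is what this table lookup performs, a vrev32q_u8 formulation is an equivalent alternative (a hedged sketch, not what the header uses):

#include <arm_neon.h>

inline float32x4_t EndianSwap32(float32x4_t v)
{
    uint8x16_t bytes = vreinterpretq_u8_f32(v); // view the lanes as bytes
    bytes = vrev32q_u8(bytes);                  // reverse bytes within each 32-bit lane
    return vreinterpretq_f32_u8(bytes);         // back to float lanes
}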
#ifndef _M_ARM
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
namespace SSSE3
{
@@ -65,13 +58,21 @@ inline bool XMVerifySSSE3Support()
// Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
int CPUInfo[4] = { -1 };
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// Check for SSSE3 instruction set.
return ( (CPUInfo[2] & 0x200) != 0 );
@@ -82,13 +83,13 @@ inline XMVECTOR XM_CALLCONV XMVectorEndian
FXMVECTOR V
)
{
static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } };
__m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx );
return _mm_castsi128_ps( Result );
}
} // namespace SSSE3
#endif // !_M_ARM
#endif // X86 || X64
} // namespace DirectX;
} // namespace DirectX

Extensions/DirectXMathF16C.h

@@ -7,25 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error F16C not supported on ARM platform
#endif
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#error F16C/CVT16 intrinsics requires Visual C++ 2012 or later.
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <immintrin.h>
#include <DirectXMath.h>
#include <DirectXPackedVector.h>
@@ -41,13 +28,21 @@ inline bool XMVerifyF16CSupport()
// with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
int CPUInfo[4] = { -1 };
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We check for F16C, AVX, OSXSAVE, and SSE4.1
return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 );
@@ -60,7 +55,7 @@ inline bool XMVerifyF16CSupport()
inline float XMConvertHalfToFloat( PackedVector::HALF Value )
{
__m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
__m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
__m128 V2 = _mm_cvtph_ps( V1 );
return _mm_cvtss_f32( V2 );
}
@@ -74,10 +69,10 @@ inline PackedVector::HALF XMConvertFloatToHalf( float Value )
inline float* XMConvertHalfToFloatStream
(
_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream,
_In_ size_t OutputStride,
_In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream,
_In_ size_t InputStride,
_Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream,
_In_ size_t OutputStride,
_In_reads_bytes_(2 + InputStride * (HalfCount - 1)) const PackedVector::HALF* pInputStream,
_In_ size_t InputStride,
_In_ size_t HalfCount
)
{
@@ -85,29 +80,33 @@ inline float* XMConvertHalfToFloatStream
assert(pOutputStream);
assert(pInputStream);
const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
assert(InputStride >= sizeof(HALF));
assert(OutputStride >= sizeof(float));
auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = HalfCount >> 2;
if ( four > 0 )
if (four > 0)
{
if (InputStride == sizeof(HALF))
{
if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Packed input, aligned & packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
pHalf += InputStride * 4;
__m128 FV = _mm_cvtph_ps( HV );
__m128 FV = _mm_cvtph_ps(HV);
_mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
@@ -116,13 +115,13 @@ inline float* XMConvertHalfToFloatStream
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
pHalf += InputStride * 4;
__m128 FV = _mm_cvtph_ps( HV );
__m128 FV = _mm_cvtph_ps(HV);
_mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride*4;
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
@@ -132,26 +131,26 @@ inline float* XMConvertHalfToFloatStream
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
pHalf += InputStride*4;
__m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
pHalf += InputStride * 4;
__m128 FV = _mm_cvtph_ps( HV );
__m128 FV = _mm_cvtph_ps(HV);
_mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
pFloat += OutputStride;
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
pFloat += OutputStride;
i += 4;
}
}
}
else if (OutputStride == sizeof(float))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Scattered input, aligned & packed output
for (size_t j = 0; j < four; ++j)
@@ -166,14 +165,14 @@ inline float* XMConvertHalfToFloatStream
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
HV = _mm_insert_epi16(HV, H1, 0);
HV = _mm_insert_epi16(HV, H2, 1);
HV = _mm_insert_epi16(HV, H3, 2);
HV = _mm_insert_epi16(HV, H4, 3);
__m128 FV = _mm_cvtph_ps(HV);
_mm_stream_ps( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
_mm_stream_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
@@ -192,16 +191,49 @@ inline float* XMConvertHalfToFloatStream
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16( HV, H1, 0 );
HV = _mm_insert_epi16( HV, H2, 1 );
HV = _mm_insert_epi16( HV, H3, 2 );
HV = _mm_insert_epi16( HV, H4, 3 );
__m128 FV = _mm_cvtph_ps( HV );
HV = _mm_insert_epi16(HV, H1, 0);
HV = _mm_insert_epi16(HV, H2, 1);
HV = _mm_insert_epi16(HV, H3, 2);
HV = _mm_insert_epi16(HV, H4, 3);
__m128 FV = _mm_cvtph_ps(HV);
_mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
pFloat += OutputStride*4;
_mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride * 4;
i += 4;
}
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
pHalf += InputStride;
__m128i HV = _mm_setzero_si128();
HV = _mm_insert_epi16(HV, H1, 0);
HV = _mm_insert_epi16(HV, H2, 1);
HV = _mm_insert_epi16(HV, H3, 2);
HV = _mm_insert_epi16(HV, H4, 3);
__m128 FV = _mm_cvtph_ps(HV);
_mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
pFloat += OutputStride;
*reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
pFloat += OutputStride;
i += 4;
}
}
}
@@ -210,7 +242,7 @@ inline float* XMConvertHalfToFloatStream
{
*reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
pHalf += InputStride;
pFloat += OutputStride;
pFloat += OutputStride;
}
return pOutputStream;
@@ -219,10 +251,10 @@ inline float* XMConvertHalfToFloatStream
inline PackedVector::HALF* XMConvertFloatToHalfStream
(
_Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream,
_In_ size_t OutputStride,
_In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream,
_In_ size_t InputStride,
_Out_writes_bytes_(2 + OutputStride * (FloatCount - 1)) PackedVector::HALF* pOutputStream,
_In_ size_t OutputStride,
_In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream,
_In_ size_t InputStride,
_In_ size_t FloatCount
)
{
@@ -230,8 +262,12 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
assert(pOutputStream);
assert(pInputStream);
const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
assert(InputStride >= sizeof(float));
assert(OutputStride >= sizeof(HALF));
auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
size_t i = 0;
size_t four = FloatCount >> 2;
@@ -241,18 +277,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
{
if (OutputStride == sizeof(HALF))
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Aligned and packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
pHalf += OutputStride * 4;
i += 4;
}
}
@@ -261,36 +297,36 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
// Packed input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
pHalf += OutputStride * 4;
i += 4;
}
}
}
else
{
if ( ((uintptr_t)pFloat & 0xF) == 0)
if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
{
// Aligned & packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
pHalf += OutputStride;
i += 4;
}
@@ -300,18 +336,18 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
// Packed input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
pFloat += InputStride*4;
__m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride * 4;
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
pHalf += OutputStride;
i += 4;
}
@@ -323,26 +359,60 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
// Scattered input, packed output
for (size_t j = 0; j < four; ++j)
{
__m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
__m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
FV = _mm_blend_ps( FV, FT, 0xC );
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
FV = _mm_blend_ps(FV, FT, 0xC);
__m128i HV = _mm_cvtps_ph( FV, 0 );
__m128i HV = _mm_cvtps_ph(FV, 0);
_mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
pHalf += OutputStride*4;
_mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
pHalf += OutputStride * 4;
i += 4;
}
}
else
{
// Scattered input, scattered output
for (size_t j = 0; j < four; ++j)
{
__m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
pFloat += InputStride;
__m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
__m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
FV = _mm_blend_ps(FV, FT, 0xC);
__m128i HV = _mm_cvtps_ph(FV, 0);
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
pHalf += OutputStride;
*reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
pHalf += OutputStride;
i += 4;
}
}
@@ -351,7 +421,7 @@ inline PackedVector::HALF* XMConvertFloatToHalfStream
for (; i < FloatCount; ++i)
{
*reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
pFloat += InputStride;
pFloat += InputStride;
pHalf += OutputStride;
}
@@ -398,4 +468,4 @@ inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination,
} // namespace F16C
} // namespace DirectX;
} // namespace DirectX

Extensions/DirectXMathFMA3.h

@@ -7,25 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error FMA3 not supported on ARM platform
#endif
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#error FMA3 intrinsics requires Visual C++ 2012 or later.
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <immintrin.h>
#include <DirectXMath.h>
namespace DirectX
@@ -41,12 +28,20 @@ inline bool XMVerifyFMA3Support()
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We check for FMA3, AVX, OSXSAVE
return ( (CPUInfo[2] & 0x18001000) == 0x18001000 );
@@ -221,7 +216,7 @@ inline XMVECTOR XM_CALLCONV XMVector3Unproject
CXMMATRIX World
)
{
static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
Scale = XMVectorReciprocal(Scale);

Extensions/DirectXMathFMA4.h

@@ -7,23 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error FMA4 not supported on ARM platform
#endif
#if defined(_MSC_VER) && (_MSC_VER < 1600)
#error FMA4 intrinsics requires Visual C++ 2010 Service Pack 1 or later.
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <ammintrin.h>
#include <DirectXMath.h>
@@ -41,20 +30,32 @@ inline bool XMVerifyFMA4Support()
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We check for AVX, OSXSAVE (required to access FMA4)
if ( (CPUInfo[2] & 0x18000000) != 0x18000000 )
return false;
__cpuid( CPUInfo, 0x80000000 );
#ifdef __clang__
__cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0x80000000);
#endif
if ( CPUInfo[0] < 0x80000001 )
if ( uint32_t(CPUInfo[0]) < 0x80000001u )
return false;
// We check for FMA4
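The cast on the extended-leaf test is a sign-compare fix: 0x80000001 does not fit in a 32-bit int, so the literal is already unsigned and the old form compared the signed CPUInfo[0] against it through an implicit conversion that clang diagnoses under -Wsign-compare. A sketch of the underlying rule, assuming a 32-bit int:

#include <type_traits>

// Hex literals climb int -> unsigned int -> long -> ... until the value fits:
static_assert(std::is_same<decltype(0x80000001), unsigned int>::value,
              "0x80000001 exceeds INT_MAX, so its type is unsigned int");
// uint32_t(CPUInfo[0]) < 0x80000001u keeps both operands explicitly unsigned.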
@@ -230,7 +231,7 @@ inline XMVECTOR XM_CALLCONV XMVector3Unproject
CXMMATRIX World
)
{
static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
Scale = XMVectorReciprocal(Scale);

Extensions/DirectXMathSSE3.h

@@ -7,19 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error SSE3 not supported on ARM platform
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <pmmintrin.h>
#include <DirectXMath.h>
@@ -35,13 +28,20 @@ inline bool XMVerifySSE3Support()
// Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
int CPUInfo[4] = { -1 };
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We only check for SSE3 instruction set. SSSE3 instructions are not used.
return ( (CPUInfo[2] & 0x1) != 0 );
@@ -108,4 +108,4 @@ inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V )
} // namespace SSE3
} // namespace DirectX;
} // namespace DirectX

Extensions/DirectXMathSSE4.h

@@ -7,19 +7,12 @@
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#ifdef _MSC_VER
#pragma once
#endif
#ifdef _M_ARM
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error SSE4 not supported on ARM platform
#endif
#pragma warning(push)
#pragma warning(disable : 4987)
#include <intrin.h>
#pragma warning(pop)
#include <smmintrin.h>
#include <DirectXMath.h>
@@ -35,13 +28,20 @@ inline bool XMVerifySSE4Support()
// Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = {-1};
__cpuid( CPUInfo, 0 );
int CPUInfo[4] = { -1 };
#ifdef __clang__
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 0);
#endif
if ( CPUInfo[0] < 1 )
return false;
__cpuid(CPUInfo, 1 );
#ifdef __clang__
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
return ( (CPUInfo[2] & 0x80000) == 0x80000 );
@@ -52,22 +52,26 @@ inline bool XMVerifySSE4Support()
// Vector
//-------------------------------------------------------------------------------------
#ifdef __clang__
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
{
assert( y != nullptr );
*((int*)y) = _mm_extract_ps( V, 1 );
*reinterpret_cast<int*>(y) = _mm_extract_ps( V, 1 );
}
inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
{
assert( z != nullptr );
*((int*)z) = _mm_extract_ps( V, 2 );
*reinterpret_cast<int*>(z) = _mm_extract_ps( V, 2 );
}
inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
{
assert( w != nullptr );
*((int*)w) = _mm_extract_ps( V, 3 );
*reinterpret_cast<int*>(w) = _mm_extract_ps( V, 3 );
}
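These getters work because _mm_extract_ps, despite its name, returns the selected lane's raw bits as an int, so the bits are stored through an int* (the new pragma silences clang's -Wundefined-reinterpret-cast for exactly that pun). A pun-free alternative costs one shuffle (a sketch; GetLaneY is not part of the header):

#include <smmintrin.h> // SSE4.1; pulls in _mm_shuffle_ps via xmmintrin.h

inline float GetLaneY(__m128 v)
{
    // Broadcast lane 1 into lane 0, then read lane 0 as a float directly.
    return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
}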
inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
@@ -410,4 +414,4 @@ inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P )
} // namespace SSE4
} // namespace DirectX;
} // namespace DirectX