Fixed overread problem with XMUNIBBLE4 and other 16-bit packed types (#128)

2024-11-21 20:00:12 +00:00 · 2021-09-10 17:45:22 -07:00 · 2021-09-10 17:45:22 -07:00 · 77faf5af62
commit 77faf5af62
parent 792ae2adc0
1 changed file with 17 additions and 10 deletions
--- a/Inc/DirectXPackedVector.inl
+++ b/Inc/DirectXPackedVector.inl
@@ -985,7 +985,8 @@ inline XMVECTOR XM_CALLCONV XMLoadByteN2(const XMBYTEN2* pSource) noexcept
    static const XMVECTORF32 Scale = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 0, 0 } } };
    static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
    // Mask
    vTemp = _mm_and_ps(vTemp, Mask);
    // x,y and z are unsigned! Flip the bits to convert the order to signed
@@ -1024,7 +1025,8 @@ inline XMVECTOR XM_CALLCONV XMLoadByte2(const XMBYTE2* pSource) noexcept
    static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } };
    static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
    // Mask
    vTemp = _mm_and_ps(vTemp, Mask);
    // x,y and z are unsigned! Flip the bits to convert the order to signed
@@ -1062,7 +1064,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUByteN2(const XMUBYTEN2* pSource) noexcept
    static const XMVECTORF32 Scale = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 0, 0 } } };
    static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
    // Mask
    vTemp = _mm_and_ps(vTemp, Mask);
    // w is signed! Flip the bits to convert the order to unsigned
@@ -1099,7 +1102,8 @@ inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept
    static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } };
    static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
    // Mask
    vTemp = _mm_and_ps(vTemp, Mask);
    // w is signed! Flip the bits to convert the order to unsigned
@@ -1137,8 +1141,9 @@ inline XMVECTOR XM_CALLCONV XMLoadU565(const XMU565* pSource) noexcept
 #elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } };
    static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } };
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Get the 16 bit value and splat it
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult, U565And);
    // Convert to float
@@ -1971,8 +1976,9 @@ inline XMVECTOR XM_CALLCONV XMLoadUNibble4(const XMUNIBBLE4* pSource) noexcept
 #elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } };
    static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } };
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Get the 16 bit value and splat it
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0,0,0,0));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult, UNibble4And);
    // Convert to float
@@ -2007,8 +2013,9 @@ inline XMVECTOR XM_CALLCONV XMLoadU555(const XMU555* pSource) noexcept
 #elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } };
    static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } };
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Get the 16 bit value and splat it
+    __m128i vInt = _mm_loadu_si16(&pSource->v);
+    XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult, U555And);
    // Convert to float