diff --git a/.gitignore b/.gitignore index 9477d57..ab37976 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,5 @@ x64 /Tests /wiki /out +*.o +/CMakeFiles/* diff --git a/CMakeCache.txt b/CMakeCache.txt new file mode 100644 index 0000000..7c0bd57 --- /dev/null +++ b/CMakeCache.txt @@ -0,0 +1,339 @@ +# This is the CMakeCache file. +# For build in directory: /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex +# It was generated by CMake: /usr/bin/cmake +# You can edit this file to change values found and used by cmake. +# If you do not want to change any of the values, simply exit the editor. +# If you do want to change a value, simply edit, save, and exit the editor. +# The syntax for the file is as follows: +# KEY:TYPE=VALUE +# KEY is the name of a variable in the cache. +# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!. +# VALUE is the current value for the KEY. + +######################## +# EXTERNAL cache entries +######################## + +//Build with OpenMP support +BC_USE_OPENMP:BOOL=ON + +//Build with DirectX11 Runtime support +BUILD_DX11:BOOL=ON + +//Build with DirectX12 Runtime support +BUILD_DX12:BOOL=ON + +//Path to a program. +CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line + +//Path to a program. +CMAKE_AR:FILEPATH=/usr/bin/ar + +//Choose the type of build, options are: None Debug Release RelWithDebInfo +// MinSizeRel ... +CMAKE_BUILD_TYPE:STRING= + +//Enable/Disable color output during build. +CMAKE_COLOR_MAKEFILE:BOOL=ON + +//No help, variable specified on the command line. +CMAKE_CXX_COMPILER:UNINITIALIZED=/usr/bin/clang++ + +//LLVM archiver +CMAKE_CXX_COMPILER_AR:FILEPATH=CMAKE_CXX_COMPILER_AR-NOTFOUND + +//Generate index for LLVM archive +CMAKE_CXX_COMPILER_RANLIB:FILEPATH=CMAKE_CXX_COMPILER_RANLIB-NOTFOUND + +//Flags used by the CXX compiler during all build types. +CMAKE_CXX_FLAGS:STRING= + +//Flags used by the CXX compiler during DEBUG builds. +CMAKE_CXX_FLAGS_DEBUG:STRING= + +//Flags used by the CXX compiler during MINSIZEREL builds. +CMAKE_CXX_FLAGS_MINSIZEREL:STRING= + +//Flags used by the CXX compiler during RELEASE builds. +CMAKE_CXX_FLAGS_RELEASE:STRING= + +//Flags used by the CXX compiler during RELWITHDEBINFO builds. +CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING= + +//No help, variable specified on the command line. +CMAKE_C_COMPILER:UNINITIALIZED=/usr/bin/clang + +//Path to a program. +CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND + +//Flags used by the linker during all build types. +CMAKE_EXE_LINKER_FLAGS:STRING= + +//Flags used by the linker during DEBUG builds. +CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING= + +//Flags used by the linker during MINSIZEREL builds. +CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING= + +//Flags used by the linker during RELEASE builds. +CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING= + +//Flags used by the linker during RELWITHDEBINFO builds. +CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING= + +//Enable/Disable output of compile commands during generation. +CMAKE_EXPORT_COMPILE_COMMANDS:BOOL= + +//Install path prefix, prepended onto install directories. +CMAKE_INSTALL_PREFIX:PATH=/usr/local + +//Path to a program. +CMAKE_LINKER:FILEPATH=/usr/bin/ld + +//Path to a program. +CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/make + +//Flags used by the linker during the creation of modules during +// all build types. +CMAKE_MODULE_LINKER_FLAGS:STRING= + +//Flags used by the linker during the creation of modules during +// DEBUG builds. 
+CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING= + +//Flags used by the linker during the creation of modules during +// MINSIZEREL builds. +CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING= + +//Flags used by the linker during the creation of modules during +// RELEASE builds. +CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING= + +//Flags used by the linker during the creation of modules during +// RELWITHDEBINFO builds. +CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING= + +//Path to a program. +CMAKE_NM:FILEPATH=/usr/bin/nm + +//Path to a program. +CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy + +//Path to a program. +CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump + +//Value Computed by CMake +CMAKE_PROJECT_DESCRIPTION:STATIC= + +//Value Computed by CMake +CMAKE_PROJECT_HOMEPAGE_URL:STATIC= + +//Value Computed by CMake +CMAKE_PROJECT_NAME:STATIC=DirectXTex + +//Path to a program. +CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib + +//Path to a program. +CMAKE_READELF:FILEPATH=/usr/bin/readelf + +//Flags used by the linker during the creation of shared libraries +// during all build types. +CMAKE_SHARED_LINKER_FLAGS:STRING= + +//Flags used by the linker during the creation of shared libraries +// during DEBUG builds. +CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING= + +//Flags used by the linker during the creation of shared libraries +// during MINSIZEREL builds. +CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING= + +//Flags used by the linker during the creation of shared libraries +// during RELEASE builds. +CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING= + +//Flags used by the linker during the creation of shared libraries +// during RELWITHDEBINFO builds. +CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING= + +//If set, runtime paths are not added when installing shared libraries, +// but are added when building. +CMAKE_SKIP_INSTALL_RPATH:BOOL=NO + +//If set, runtime paths are not added when using shared libraries. +CMAKE_SKIP_RPATH:BOOL=NO + +//Flags used by the linker during the creation of static libraries +// during all build types. +CMAKE_STATIC_LINKER_FLAGS:STRING= + +//Flags used by the linker during the creation of static libraries +// during DEBUG builds. +CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING= + +//Flags used by the linker during the creation of static libraries +// during MINSIZEREL builds. +CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING= + +//Flags used by the linker during the creation of static libraries +// during RELEASE builds. +CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING= + +//Flags used by the linker during the creation of static libraries +// during RELWITHDEBINFO builds. +CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING= + +//Path to a program. +CMAKE_STRIP:FILEPATH=/usr/bin/strip + +//If this value is on, makefiles will be generated without the +// .SILENT directive, and all commands will be echoed to the console +// during the make. This is useful for debugging only. With Visual +// Studio IDE projects all commands are done without /nologo. 
+CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE + +//Value Computed by CMake +DirectXTex_BINARY_DIR:STATIC=/run/media/reece/Misc/GameEngine/Master/Public/DirectXTex + +//Value Computed by CMake +DirectXTex_SOURCE_DIR:STATIC=/run/media/reece/Misc/GameEngine/Master/Public/DirectXTex + +//Use Static Code Analysis on build +ENABLE_CODE_ANALYSIS:BOOL=OFF + + +######################## +# INTERNAL cache entries +######################## + +//ADVANCED property for variable: CMAKE_ADDR2LINE +CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_AR +CMAKE_AR-ADVANCED:INTERNAL=1 +//This is the directory where this CMakeCache.txt was created +CMAKE_CACHEFILE_DIR:INTERNAL=/run/media/reece/Misc/GameEngine/Master/Public/DirectXTex +//Major version of cmake used to create the current loaded cache +CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3 +//Minor version of cmake used to create the current loaded cache +CMAKE_CACHE_MINOR_VERSION:INTERNAL=18 +//Patch version of cmake used to create the current loaded cache +CMAKE_CACHE_PATCH_VERSION:INTERNAL=2 +//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE +CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1 +//Path to CMake executable. +CMAKE_COMMAND:INTERNAL=/usr/bin/cmake +//Path to cpack program executable. +CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack +//Path to ctest program executable. +CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest +//ADVANCED property for variable: CMAKE_CXX_COMPILER +CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_COMPILER_AR +CMAKE_CXX_COMPILER_AR-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_COMPILER_RANLIB +CMAKE_CXX_COMPILER_RANLIB-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_FLAGS +CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG +CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL +CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE +CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO +CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_DLLTOOL +CMAKE_DLLTOOL-ADVANCED:INTERNAL=1 +//Path to cache edit program executable. +CMAKE_EDIT_COMMAND:INTERNAL=/usr/bin/ccmake +//Executable file format +CMAKE_EXECUTABLE_FORMAT:INTERNAL=Unknown +//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS +CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG +CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL +CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE +CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO +CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS +CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1 +//Name of external makefile project generator. +CMAKE_EXTRA_GENERATOR:INTERNAL= +//Name of generator. +CMAKE_GENERATOR:INTERNAL=Unix Makefiles +//Generator instance identifier. +CMAKE_GENERATOR_INSTANCE:INTERNAL= +//Name of generator platform. +CMAKE_GENERATOR_PLATFORM:INTERNAL= +//Name of generator toolset. 
+CMAKE_GENERATOR_TOOLSET:INTERNAL= +//Source directory with the top level CMakeLists.txt file for this +// project +CMAKE_HOME_DIRECTORY:INTERNAL=/run/media/reece/Misc/GameEngine/Master/Public/DirectXTex +//Install .so files without execute permission. +CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0 +//ADVANCED property for variable: CMAKE_LINKER +CMAKE_LINKER-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_MAKE_PROGRAM +CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS +CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG +CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL +CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE +CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO +CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_NM +CMAKE_NM-ADVANCED:INTERNAL=1 +//number of local generators +CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1 +//ADVANCED property for variable: CMAKE_OBJCOPY +CMAKE_OBJCOPY-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_OBJDUMP +CMAKE_OBJDUMP-ADVANCED:INTERNAL=1 +//Platform information initialized +CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_RANLIB +CMAKE_RANLIB-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_READELF +CMAKE_READELF-ADVANCED:INTERNAL=1 +//Path to CMake installation. +CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.18 +//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS +CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG +CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL +CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE +CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO +CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH +CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_SKIP_RPATH +CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS +CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG +CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL +CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE +CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO +CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1 +//ADVANCED property for variable: CMAKE_STRIP +CMAKE_STRIP-ADVANCED:INTERNAL=1 +//uname command +CMAKE_UNAME:INTERNAL=/usr/bin/uname +//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE +CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1 + diff --git a/CMakeLists.txt b/CMakeLists.txt index 53b41a3..00753e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,14 +20,22 @@ option(BC_USE_OPENMP "Build 
with OpenMP support" ON) option(ENABLE_CODE_ANALYSIS "Use Static Code Analysis on build" OFF) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_FLAGS "-fpermissive") + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/CMake") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/CMake") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/CMake") +if (WIN32) + set (PLATFORM_FLAGS ) +else() + set (PLATFORM_FLAGS _DXTX_NOWIN) +endif() + set(LIBRARY_SOURCES DirectXTex/BC.h DirectXTex/BCDirectCompute.h @@ -37,12 +45,11 @@ set(LIBRARY_SOURCES DirectXTex/DirectXTexP.h DirectXTex/filters.h DirectXTex/scoped.h + DirectXTex/BC.cpp DirectXTex/BC4BC5.cpp DirectXTex/BC6HBC7.cpp - DirectXTex/BCDirectCompute.cpp - DirectXTex/DirectXTexCompress.cpp - DirectXTex/DirectXTexCompressGPU.cpp + DirectXTex/DirectXTexConvert.cpp DirectXTex/DirectXTexDDS.cpp DirectXTex/DirectXTexFlipRotate.cpp @@ -54,12 +61,14 @@ set(LIBRARY_SOURCES DirectXTex/DirectXTexPMAlpha.cpp DirectXTex/DirectXTexResize.cpp DirectXTex/DirectXTexTGA.cpp - DirectXTex/DirectXTexUtil.cpp - DirectXTex/DirectXTexWIC.cpp) + DirectXTex/DirectXTexUtil.cpp) + +if (WIN32) + set(LIBRARY_SOURCES ${LIBRARY_SOURCES} DirectXTex/DirectXTexCompressGPU.cpp) + set(LIBRARY_SOURCES ${LIBRARY_SOURCES} DirectXTex/BCDirectCompute.cpp) + set(LIBRARY_SOURCES ${LIBRARY_SOURCES} DirectXTex/DirectXTexWIC.cpp) +endif() -set(SHADER_SOURCES - DirectXTex/Shaders/BC6HEncode.hlsl - DirectXTex/Shaders/BC7Encode.hlsl) if(BUILD_DX11) set(LIBRARY_SOURCES ${LIBRARY_SOURCES} DirectXTex/DirectXTexD3D11.cpp) @@ -68,20 +77,34 @@ if(BUILD_DX12) set(LIBRARY_SOURCES ${LIBRARY_SOURCES} DirectXTex/DirectXTexD3D12.cpp) endif() -add_library(${PROJECT_NAME} STATIC ${LIBRARY_SOURCES} DirectXTex/Shaders/Compiled/BC6HEncode_EncodeBlockCS.inc) +set(SHADER_SOURCES + DirectXTex/Shaders/BC6HEncode.hlsl + DirectXTex/Shaders/BC7Encode.hlsl) + +if (WIN32) + add_library(${PROJECT_NAME} STATIC ${LIBRARY_SOURCES} DirectXTex/Shaders/Compiled/BC6HEncode_EncodeBlockCS.inc) + + add_custom_command( + OUTPUT "${PROJECT_SOURCE_DIR}/DirectXTex/Shaders/Compiled/BC6HEncode_EncodeBlockCS.inc" + MAIN_DEPENDENCY "${PROJECT_SOURCE_DIR}/DirectXTex/Shaders/CompileShaders.cmd" + DEPENDS ${SHADER_SOURCES} + COMMENT "Generating HLSL shaders..." + COMMAND "CompileShaders.cmd" + WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/DirectXTex/Shaders" + USES_TERMINAL) + +else() + + add_library(${PROJECT_NAME} STATIC ${LIBRARY_SOURCES}) + +endif() -add_custom_command( - OUTPUT "${PROJECT_SOURCE_DIR}/DirectXTex/Shaders/Compiled/BC6HEncode_EncodeBlockCS.inc" - MAIN_DEPENDENCY "${PROJECT_SOURCE_DIR}/DirectXTex/Shaders/CompileShaders.cmd" - DEPENDS ${SHADER_SOURCES} - COMMENT "Generating HLSL shaders..." 
- COMMAND "CompileShaders.cmd" - WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/DirectXTex/Shaders" - USES_TERMINAL) source_group(${PROJECT_NAME} REGULAR_EXPRESSION DirectXTex/*.*) target_include_directories(${PROJECT_NAME} PUBLIC DirectXTex) +target_include_directories(${PROJECT_NAME} PUBLIC DirectXMath) +target_include_directories(${PROJECT_NAME} PUBLIC PlatformSupport) if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16") target_precompile_headers(${PROJECT_NAME} PRIVATE DirectXTex/DirectXTexP.h) @@ -130,9 +153,11 @@ if(MSVC) endif() endif() +# TODO FIX ME if ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) set(WarningsLib "-Wpedantic" "-Wextra") target_compile_options(${PROJECT_NAME} PRIVATE ${WarningsLib}) + target_compile_options(${PROJECT_NAME} PRIVATE "-fdeclspec") # OpenMP is not supported for clang for Windows by default @@ -173,6 +198,7 @@ if ( CMAKE_CXX_COMPILER_ID MATCHES "MSVC" ) target_compile_options(texdiag PRIVATE ${WarningsEXE}) endif() +target_compile_definitions(${PROJECT_NAME} PRIVATE ${PLATFORM_FLAGS}) if(WIN32) target_compile_definitions(${PROJECT_NAME} PRIVATE _UNICODE UNICODE) target_compile_definitions(texassemble PRIVATE _UNICODE UNICODE _WIN32_WINNT=0x0601) diff --git a/DirectXMath/DirectXCollision.h b/DirectXMath/DirectXCollision.h new file mode 100644 index 0000000..94777bd --- /dev/null +++ b/DirectXMath/DirectXCollision.h @@ -0,0 +1,353 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + enum ContainmentType + { + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2 + }; + + enum PlaneIntersectionType + { + FRONT = 0, + INTERSECTING = 1, + BACK = 2 + }; + + struct BoundingBox; + struct BoundingOrientedBox; + struct BoundingFrustum; + +#pragma warning(push) +#pragma warning(disable:4324 4820) + // C4324: alignment padding warnings + // C4820: Off by default noise + + //------------------------------------------------------------------------------------- + // Bounding sphere + //------------------------------------------------------------------------------------- + struct BoundingSphere + { + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. 
+ + // Creators + BoundingSphere() noexcept : Center(0, 0, 0), Radius(1.f) {} + + BoundingSphere(const BoundingSphere&) = default; + BoundingSphere& operator=(const BoundingSphere&) = default; + + BoundingSphere(BoundingSphere&&) = default; + BoundingSphere& operator=(BoundingSphere&&) = default; + + constexpr BoundingSphere(_In_ const XMFLOAT3& center, _In_ float radius) noexcept + : Center(center), Radius(radius) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + // Transform the sphere + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-sphere test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-sphere test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-sphere test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged(_Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2) noexcept; + + static void CreateFromBoundingBox(_Out_ BoundingSphere& Out, _In_ const BoundingBox& box) noexcept; + static void CreateFromBoundingBox(_Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box) noexcept; + + static void CreateFromPoints(_Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + + static void CreateFromFrustum(_Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Axis-aligned bounding box + //------------------------------------------------------------------------------------- + struct BoundingBox + { + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. 
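+
+        // Illustrative usage (a hypothetical example, not part of the upstream
+        // header): the box is stored as center/half-extents, so a min/max point
+        // pair converts via CreateFromPoints.
+        //
+        //     BoundingBox box;
+        //     BoundingBox::CreateFromPoints(box,
+        //         XMVectorSet(-1.f, 0.f, -1.f, 0.f),
+        //         XMVectorSet(1.f, 2.f, 1.f, 0.f));
+        //     // box.Center == (0, 1, 0), box.Extents == (1, 1, 1)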
+ + // Creators + BoundingBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f) {} + + BoundingBox(const BoundingBox&) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + BoundingBox(BoundingBox&&) = default; + BoundingBox& operator=(BoundingBox&&) = default; + + constexpr BoundingBox(_In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents) noexcept + : Center(center), Extents(extents) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-Box test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-box test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-Box test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged(_Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2) noexcept; + + static void CreateFromSphere(_Out_ BoundingBox& Out, _In_ const BoundingSphere& sh) noexcept; + + static void XM_CALLCONV CreateFromPoints(_Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2) noexcept; + static void CreateFromPoints(_Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Oriented bounding box + //------------------------------------------------------------------------------------- + struct BoundingOrientedBox + { + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). 
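+
+        // Note: the implementation asserts that Orientation is a unit quaternion
+        // (see the XMQuaternionIsUnit checks in DirectXCollision.inl), so callers
+        // should normalize, e.g. with XMQuaternionNormalize, before filling this field.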
+ + // Creators + BoundingOrientedBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f), Orientation(0, 0, 0, 1.f) {} + + BoundingOrientedBox(const BoundingOrientedBox&) = default; + BoundingOrientedBox& operator=(const BoundingOrientedBox&) = default; + + BoundingOrientedBox(BoundingOrientedBox&&) = default; + BoundingOrientedBox& operator=(BoundingOrientedBox&&) = default; + + constexpr BoundingOrientedBox(_In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation) noexcept + : Center(_Center), Extents(_Extents), Orientation(_Orientation) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-OrientedBox test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-OrientedBox test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-OrientedBox test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox(_Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box) noexcept; + + static void CreateFromPoints(_Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Bounding frustum + //------------------------------------------------------------------------------------- + struct BoundingFrustum + { + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X (X/Z) + float LeftSlope; // Negative X + float TopSlope; // Positive Y (Y/Z) + float BottomSlope; // Negative Y + float Near, Far; // Z of the near plane and far plane. 
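+
+        // Note: the slopes are X/Z and Y/Z ratios, so a corner is recovered by
+        // scaling by a depth; e.g. the near top-right corner in frustum-local space
+        // is (RightSlope * Near, TopSlope * Near, Near). The containment tests in
+        // DirectXCollision.inl build all eight corners exactly this way before
+        // rotating by Orientation and translating by Origin.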
+ + // Creators + BoundingFrustum() noexcept : + Origin(0, 0, 0), Orientation(0, 0, 0, 1.f), RightSlope(1.f), LeftSlope(-1.f), + TopSlope(1.f), BottomSlope(-1.f), Near(0), Far(1.f) {} + + BoundingFrustum(const BoundingFrustum&) = default; + BoundingFrustum& operator=(const BoundingFrustum&) = default; + + BoundingFrustum(BoundingFrustum&&) = default; + BoundingFrustum& operator=(BoundingFrustum&&) = default; + + constexpr BoundingFrustum(_In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, + _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, + _In_ float _Near, _In_ float _Far) noexcept + : Origin(_Origin), Orientation(_Orientation), + RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), + Near(_Near), Far(_Far) {} + BoundingFrustum(_In_ CXMMATRIX Projection, bool rhcoords = false) noexcept; + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the frustum + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sp) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + // Frustum-Frustum test + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-Frustum test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-Frustum test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-Frustum test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes(_Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane) const noexcept; + // Create 6 Planes representation of Frustum + + // Static methods + static void XM_CALLCONV CreateFromMatrix(_Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection, bool rhcoords = false) noexcept; + }; + + //----------------------------------------------------------------------------- + // Triangle intersection testing routines. 
+ //----------------------------------------------------------------------------- + namespace TriangleTests + { + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist) noexcept; + // Ray-Triangle + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2) noexcept; + // Triangle-Triangle + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane) noexcept; + // Plane-Triangle + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5) noexcept; + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) + } + +#pragma warning(pop) + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable : 4068 4365 4616 6001) + // C4068/4616: ignore unknown pragmas + // C4365: Off by default noise + // C6001: False positives + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#include "DirectXCollision.inl" + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +#pragma warning(pop) + +} // namespace DirectX + diff --git a/DirectXMath/DirectXCollision.inl b/DirectXMath/DirectXCollision.inl new file mode 100644 index 0000000..c65ef54 --- /dev/null +++ b/DirectXMath/DirectXCollision.inl @@ -0,0 +1,4816 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.inl -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = +{ + { { { -1.0f, -1.0f, 1.0f, 0.0f } } }, + { { { 1.0f, -1.0f, 1.0f, 0.0f } } }, + { { { 1.0f, 1.0f, 1.0f, 0.0f } } }, + { { { -1.0f, 1.0f, 1.0f, 0.0f } } }, + { { { -1.0f, -1.0f, -1.0f, 0.0f } } }, + { { { 1.0f, -1.0f, -1.0f, 0.0f } } }, + { { { 1.0f, 1.0f, -1.0f, 0.0f } } }, + { { { -1.0f, 1.0f, -1.0f, 0.0f } } }, +}; + +XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { { { 1e-20f, 1e-20f, 1e-20f, 1e-20f } } }; +XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { { { -1e-20f, -1e-20f, -1e-20f, -1e-20f } } }; +XMGLOBALCONST XMVECTORF32 g_FltMin = { { { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX } } }; +XMGLOBALCONST XMVECTORF32 g_FltMax = { { { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX } } }; + +namespace Internal +{ + + //----------------------------------------------------------------------------- + // Return true if any of the elements of a 3 vector are equal to 0xffffffff. + // Slightly more efficient than using XMVector3EqualInt. + //----------------------------------------------------------------------------- + inline bool XMVector3AnyTrue(_In_ FXMVECTOR V) noexcept + { + // Duplicate the fourth element from the first element. 
+        XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+        return XMComparisonAnyTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
+    }
+
+
+    //-----------------------------------------------------------------------------
+    // Return true if all of the elements of a 3 vector are equal to 0xffffffff.
+    // Slightly more efficient than using XMVector3EqualInt.
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3AllTrue(_In_ FXMVECTOR V) noexcept
+    {
+        // Duplicate the fourth element from the first element.
+        XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+        return XMComparisonAllTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
+    }
+
+#if defined(_PREFAST_) || !defined(NDEBUG)
+
+    XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+
+    //-----------------------------------------------------------------------------
+    // Return true if the vector is a unit vector (length == 1).
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3IsUnit(_In_ FXMVECTOR V) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector3Length(V), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitVectorEpsilon);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return true if the quaternion is a unit quaternion.
+    //-----------------------------------------------------------------------------
+    inline bool XMQuaternionIsUnit(_In_ FXMVECTOR Q) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector4Length(Q), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitQuaternionEpsilon);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return true if the plane is a unit plane.
+    //-----------------------------------------------------------------------------
+    inline bool XMPlaneIsUnit(_In_ FXMVECTOR Plane) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector3Length(Plane), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitPlaneEpsilon);
+    }
+
+#endif // _PREFAST_ || !NDEBUG
+
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR XMPlaneTransform(_In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) noexcept
+    {
+        XMVECTOR vNormal = XMVector3Rotate(Plane, Rotation);
+        XMVECTOR vD = XMVectorSubtract(XMVectorSplatW(Plane), XMVector3Dot(vNormal, Translation));
+
+        return XMVectorInsert<0, 0, 0, 0, 1>(vNormal, vD);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return the point on the line segment (S1, S2) nearest the point P.
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR PointOnLineSegmentNearestPoint(_In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P) noexcept
+    {
+        XMVECTOR Dir = XMVectorSubtract(S2, S1);
+        XMVECTOR Projection = XMVectorSubtract(XMVector3Dot(P, Dir), XMVector3Dot(S1, Dir));
+        XMVECTOR LengthSq = XMVector3Dot(Dir, Dir);
+
+        XMVECTOR t = XMVectorMultiply(Projection, XMVectorReciprocal(LengthSq));
+        XMVECTOR Point = XMVectorMultiplyAdd(t, Dir, S1);
+
+        // t < 0
+        XMVECTOR SelectS1 = XMVectorLess(Projection, XMVectorZero());
+        Point = XMVectorSelect(Point, S1, SelectS1);
+
+        // t > 1
+        XMVECTOR SelectS2 = XMVectorGreater(Projection, LengthSq);
+        Point = XMVectorSelect(Point, S2, SelectS2);
+
+        return Point;
+    }
+
+    //-----------------------------------------------------------------------------
+    // Test if the point (P) on the plane of the triangle is inside the triangle
+    // (V0, V1, V2).
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle(_In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2) noexcept
+    {
+        // Compute the triangle normal.
+        XMVECTOR N = XMVector3Cross(XMVectorSubtract(V2, V0), XMVectorSubtract(V1, V0));
+
+        // Compute the cross products of the vector from the base of each edge to
+        // the point with each edge vector.
+        XMVECTOR C0 = XMVector3Cross(XMVectorSubtract(P, V0), XMVectorSubtract(V1, V0));
+        XMVECTOR C1 = XMVector3Cross(XMVectorSubtract(P, V1), XMVectorSubtract(V2, V1));
+        XMVECTOR C2 = XMVector3Cross(XMVectorSubtract(P, V2), XMVectorSubtract(V0, V2));
+
+        // If the cross product points in the same direction as the normal then the
+        // point is inside the edge (it is zero if it is on the edge).
+        XMVECTOR Zero = XMVectorZero();
+        XMVECTOR Inside0 = XMVectorGreaterOrEqual(XMVector3Dot(C0, N), Zero);
+        XMVECTOR Inside1 = XMVectorGreaterOrEqual(XMVector3Dot(C1, N), Zero);
+        XMVECTOR Inside2 = XMVectorGreaterOrEqual(XMVector3Dot(C2, N), Zero);
+
+        // If the point is inside all of the edges, it is inside the triangle.
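+        // Each Inside* value is an all-ones mask when the point lies on the interior
+        // side of that edge, so ANDing the three masks yields a true mask only when
+        // every edge test passes.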
+        return XMVectorAndInt(XMVectorAndInt(Inside0, Inside1), Inside2);
+    }
+
+    //-----------------------------------------------------------------------------
+    inline bool SolveCubic(_In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v) noexcept
+    {
+        float p, q, h, rc, d, theta, costh3, sinth3;
+
+        p = f - e * e / 3.0f;
+        q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f;
+        h = q * q / 4.0f + p * p * p / 27.0f;
+
+        if (h > 0)
+        {
+            *t = *u = *v = 0.f;
+            return false; // only one real root
+        }
+
+        if ((h == 0) && (q == 0)) // all the same root
+        {
+            *t = -e / 3;
+            *u = -e / 3;
+            *v = -e / 3;
+
+            return true;
+        }
+
+        d = sqrtf(q * q / 4.0f - h);
+        if (d < 0)
+            rc = -powf(-d, 1.0f / 3.0f);
+        else
+            rc = powf(d, 1.0f / 3.0f);
+
+        theta = XMScalarACos(-q / (2.0f * d));
+        costh3 = XMScalarCos(theta / 3.0f);
+        sinth3 = sqrtf(3.0f) * XMScalarSin(theta / 3.0f);
+        *t = 2.0f * rc * costh3 - e / 3.0f;
+        *u = -rc * (costh3 + sinth3) - e / 3.0f;
+        *v = -rc * (costh3 - sinth3) - e / 3.0f;
+
+        return true;
+    }
+
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR CalculateEigenVector(_In_ float m11, _In_ float m12, _In_ float m13,
+        _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e) noexcept
+    {
+        float fTmp[3];
+        fTmp[0] = m12 * m23 - m13 * (m22 - e);
+        fTmp[1] = m13 * m12 - m23 * (m11 - e);
+        fTmp[2] = (m11 - e) * (m22 - e) - m12 * m12;
+
+        XMVECTOR vTmp = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(fTmp));
+
+        if (XMVector3Equal(vTmp, XMVectorZero())) // planar or linear
+        {
+            float f1, f2, f3;
+
+            // we only have one equation - find a valid one
+            if ((m11 - e != 0) || (m12 != 0) || (m13 != 0))
+            {
+                f1 = m11 - e; f2 = m12; f3 = m13;
+            }
+            else if ((m12 != 0) || (m22 - e != 0) || (m23 != 0))
+            {
+                f1 = m12; f2 = m22 - e; f3 = m23;
+            }
+            else if ((m13 != 0) || (m23 != 0) || (m33 - e != 0))
+            {
+                f1 = m13; f2 = m23; f3 = m33 - e;
+            }
+            else
+            {
+                // error, we'll just make something up - we have NO context
+                f1 = 1.0f; f2 = 0.0f; f3 = 0.0f;
+            }
+
+            if (f1 == 0)
+                vTmp = XMVectorSetX(vTmp, 0.0f);
+            else
+                vTmp = XMVectorSetX(vTmp, 1.0f);
+
+            if (f2 == 0)
+                vTmp = XMVectorSetY(vTmp, 0.0f);
+            else
+                vTmp = XMVectorSetY(vTmp, 1.0f);
+
+            if (f3 == 0)
+            {
+                vTmp = XMVectorSetZ(vTmp, 0.0f);
+                // recalculate y to make equation work
+                if (m12 != 0)
+                    vTmp = XMVectorSetY(vTmp, -f1 / f2);
+            }
+            else
+            {
+                vTmp = XMVectorSetZ(vTmp, (f2 - f1) / f3);
+            }
+        }
+
+        if (XMVectorGetX(XMVector3LengthSq(vTmp)) > 1e-5f)
+        {
+            return XMVector3Normalize(vTmp);
+        }
+        else
+        {
+            // Multiply by a value large enough to make the vector non-zero.
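+            // The squared length is below the 1e-5 threshold tested above, so
+            // scaling by 1e5 brings such a near-zero vector back into a range where
+            // XMVector3Normalize is numerically safe.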
+ vTmp = XMVectorScale(vTmp, 1e5f); + return XMVector3Normalize(vTmp); + } + } + + //----------------------------------------------------------------------------- + inline bool CalculateEigenVectors(_In_ float m11, _In_ float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, + _In_ float e1, _In_ float e2, _In_ float e3, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3) noexcept + { + *pV1 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e1); + *pV2 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e2); + *pV3 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e3); + + bool v1z = false; + bool v2z = false; + bool v3z = false; + + XMVECTOR Zero = XMVectorZero(); + + if (XMVector3Equal(*pV1, Zero)) + v1z = true; + + if (XMVector3Equal(*pV2, Zero)) + v2z = true; + + if (XMVector3Equal(*pV3, Zero)) + v3z = true; + + bool e12 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV2))) > 0.1f); // check for non-orthogonal vectors + bool e13 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV3))) > 0.1f); + bool e23 = (fabsf(XMVectorGetX(XMVector3Dot(*pV2, *pV3))) > 0.1f); + + if ((v1z && v2z && v3z) || (e12 && e13 && e23) || + (e12 && v3z) || (e13 && v2z) || (e23 && v1z)) // all eigenvectors are 0- any basis set + { + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return true; + } + + if (v1z && v2z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV3); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV3); + } + *pV1 = XMVector3Normalize(vTmp); + *pV2 = XMVector3Cross(*pV3, *pV1); + return true; + } + + if (v3z && v1z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV2); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV2); + } + *pV3 = XMVector3Normalize(vTmp); + *pV1 = XMVector3Cross(*pV2, *pV3); + return true; + } + + if (v2z && v3z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV1); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV1); + } + *pV2 = XMVector3Normalize(vTmp); + *pV3 = XMVector3Cross(*pV1, *pV2); + return true; + } + + if ((v1z) || e12) + { + *pV1 = XMVector3Cross(*pV2, *pV3); + return true; + } + + if ((v2z) || e23) + { + *pV2 = XMVector3Cross(*pV3, *pV1); + return true; + } + + if ((v3z) || e13) + { + *pV3 = XMVector3Cross(*pV1, *pV2); + return true; + } + + return true; + } + + //----------------------------------------------------------------------------- + inline bool CalculateEigenVectorsFromCovarianceMatrix(_In_ float Cxx, _In_ float Cyy, _In_ float Czz, + _In_ float Cxy, _In_ float Cxz, _In_ float Cyz, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3) noexcept + { + // Calculate the eigenvalues by solving a cubic equation. 
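+        // e, f, g below are the coefficients of the characteristic polynomial
+        // det(C - lambda*I) = 0 written as lambda^3 + e*lambda^2 + f*lambda + g = 0,
+        // i.e. e = -trace(C), f = the sum of the 2x2 principal minors, g = -det(C).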
+ float e = -(Cxx + Cyy + Czz); + float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz; + float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz; + + float ev1, ev2, ev3; + if (!DirectX::Internal::SolveCubic(e, f, g, &ev1, &ev2, &ev3)) + { + // set them to arbitrary orthonormal basis set + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return false; + } + + return DirectX::Internal::CalculateEigenVectors(Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectTrianglePlane( + FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, + GXMVECTOR Plane, + XMVECTOR& Outside, XMVECTOR& Inside) noexcept + { + // Plane0 + XMVECTOR Dist0 = XMVector4Dot(V0, Plane); + XMVECTOR Dist1 = XMVector4Dot(V1, Plane); + XMVECTOR Dist2 = XMVector4Dot(V2, Plane); + + XMVECTOR MinDist = XMVectorMin(Dist0, Dist1); + MinDist = XMVectorMin(MinDist, Dist2); + + XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1); + MaxDist = XMVectorMax(MaxDist, Dist2); + + XMVECTOR Zero = XMVectorZero(); + + // Outside the plane? + Outside = XMVectorGreater(MinDist, Zero); + + // Fully inside the plane? + Inside = XMVectorLess(MaxDist, Zero); + } + + //----------------------------------------------------------------------------- + inline void FastIntersectSpherePlane(_In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void FastIntersectAxisAlignedBoxPlane(_In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)]. + XMVECTOR Radius = XMVector3Dot(Extents, XMVectorAbs(Plane)); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectOrientedBoxPlane( + _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, + _In_ GXMVECTOR Axis1, + _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. 
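+        // The three dot products below gather n dot b(u), n dot b(v), n dot b(w)
+        // into one vector so a single dot product with Extents accumulates the
+        // projection radius in one step.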
+        XMVECTOR Radius = XMVector3Dot(Plane, Axis0);
+        Radius = XMVectorInsert<0, 0, 1, 0, 0>(Radius, XMVector3Dot(Plane, Axis1));
+        Radius = XMVectorInsert<0, 0, 0, 1, 0>(Radius, XMVector3Dot(Plane, Axis2));
+        Radius = XMVector3Dot(Extents, XMVectorAbs(Radius));
+
+        // Outside the plane?
+        Outside = XMVectorGreater(Dist, Radius);
+
+        // Fully inside the plane?
+        Inside = XMVectorLess(Dist, XMVectorNegate(Radius));
+    }
+
+    //-----------------------------------------------------------------------------
+    inline void XM_CALLCONV FastIntersectFrustumPlane(
+        _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2,
+        _In_ GXMVECTOR Point3,
+        _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5,
+        _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7, _In_ CXMVECTOR Plane,
+        _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept
+    {
+        // Find the min/max projection of the frustum onto the plane normal.
+        XMVECTOR Min, Max, Dist;
+
+        Min = Max = XMVector3Dot(Plane, Point0);
+
+        Dist = XMVector3Dot(Plane, Point1);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        Dist = XMVector3Dot(Plane, Point2);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        Dist = XMVector3Dot(Plane, Point3);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        Dist = XMVector3Dot(Plane, Point4);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        Dist = XMVector3Dot(Plane, Point5);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        Dist = XMVector3Dot(Plane, Point6);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        Dist = XMVector3Dot(Plane, Point7);
+        Min = XMVectorMin(Min, Dist);
+        Max = XMVectorMax(Max, Dist);
+
+        XMVECTOR PlaneDist = XMVectorNegate(XMVectorSplatW(Plane));
+
+        // Outside the plane?
+        Outside = XMVectorGreater(Min, PlaneDist);
+
+        // Fully inside the plane?
+        Inside = XMVectorLess(Max, PlaneDist);
+    }
+
+} // namespace Internal
+
+
+/****************************************************************************
+ *
+ * BoundingSphere
+ *
+ ****************************************************************************/
+
+ //-----------------------------------------------------------------------------
+ // Transform a sphere by an angle-preserving transform.
+ //-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform(BoundingSphere& Out, FXMMATRIX M) const noexcept
+{
+    // Load the center of the sphere.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+
+    // Transform the center of the sphere.
+    XMVECTOR C = XMVector3Transform(vCenter, M);
+
+    XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]);
+    XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]);
+    XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]);
+
+    XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ));
+
+    // Store the center of the sphere.
+    XMStoreFloat3(&Out.Center, C);
+
+    // Scale the radius of the sphere.
+    float Scale = sqrtf(XMVectorGetX(d));
+    Out.Radius = Radius * Scale;
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform(BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept
+{
+    // Load the center of the sphere.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+
+    // Transform the center of the sphere.
+    vCenter = XMVectorAdd(XMVector3Rotate(XMVectorScale(vCenter, Scale), Rotation), Translation);
+
+    // Store the center of the sphere.
+    XMStoreFloat3(&Out.Center, vCenter);
+
+    // Scale the radius of the sphere.
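+    // A single scalar Scale keeps the sphere a sphere; the matrix overload above
+    // instead takes the longest transformed axis (the square root of the largest
+    // row dot product) as a conservative radius scale.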
+ Out.Radius = Radius * Scale; +} + + +//----------------------------------------------------------------------------- +// Point in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(Point, vCenter)); + XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius); + + return XMVector3LessOrEqual(DistanceSquared, RadiusSquared) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + if (!Intersects(V0, V1, V2)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V0, vCenter)); + XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); + + DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V1, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared)); + + DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V2, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared)); + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR Center1 = XMLoadFloat3(&Center); + float r1 = Radius; + + XMVECTOR Center2 = XMLoadFloat3(&sh.Center); + float r2 = sh.Radius; + + XMVECTOR V = XMVectorSubtract(Center2, Center1); + + XMVECTOR Dist = XMVector3Length(V); + + float d = XMVectorGetX(Dist); + + return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR boxCenter = XMLoadFloat3(&box.Center); + XMVECTOR boxExtents = XMLoadFloat3(&box.Extents); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + XMVECTOR offset = XMVectorSubtract(boxCenter, vCenter); + + for (size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorMultiplyAdd(boxExtents, g_BoxOffset[i], offset); + XMVECTOR d = XMVector3LengthSq(C); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR boxCenter = XMLoadFloat3(&box.Center); + XMVECTOR boxExtents = XMLoadFloat3(&box.Extents); + XMVECTOR boxOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(boxOrientation)); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(boxExtents, g_BoxOffset[i]), boxOrientation), boxCenter); + XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C)); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; + +} + + +//----------------------------------------------------------------------------- +// Frustum in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR vOrigin = XMLoadFloat3(&fr.Origin); + XMVECTOR vOrientation = XMLoadFloat4(&fr.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&fr.Near); + XMVECTOR vFar = XMVectorReplicatePtr(&fr.Far); + + XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + XMVECTOR InsideAll = XMVectorTrueInt(); + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(Corners[i], vOrientation), vOrigin); + XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C)); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. sphere test. 
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects(const BoundingSphere& sh) const noexcept
+{
+    // Load A.
+    XMVECTOR vCenterA = XMLoadFloat3(&Center);
+    XMVECTOR vRadiusA = XMVectorReplicatePtr(&Radius);
+
+    // Load B.
+    XMVECTOR vCenterB = XMLoadFloat3(&sh.Center);
+    XMVECTOR vRadiusB = XMVectorReplicatePtr(&sh.Radius);
+
+    // Distance squared between centers.
+    XMVECTOR Delta = XMVectorSubtract(vCenterB, vCenterA);
+    XMVECTOR DistanceSquared = XMVector3LengthSq(Delta);
+
+    // Square of the sum of the radii.
+    XMVECTOR RadiusSquared = XMVectorAdd(vRadiusA, vRadiusB);
+    RadiusSquared = XMVectorMultiply(RadiusSquared, RadiusSquared);
+
+    return XMVector3LessOrEqual(DistanceSquared, RadiusSquared);
+}
+
+
+//-----------------------------------------------------------------------------
+// Box vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects(const BoundingBox& box) const noexcept
+{
+    return box.Intersects(*this);
+}
+
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects(const BoundingOrientedBox& box) const noexcept
+{
+    return box.Intersects(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects(const BoundingFrustum& fr) const noexcept
+{
+    return fr.Intersects(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept
+{
+    // Load the sphere.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
+
+    // Compute the plane of the triangle (has to be normalized).
+    XMVECTOR N = XMVector3Normalize(XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)));
+
+    // Assert that the triangle is not degenerate.
+    assert(!XMVector3Equal(N, XMVectorZero()));
+
+    // Find the nearest feature on the triangle to the sphere.
+    XMVECTOR Dist = XMVector3Dot(XMVectorSubtract(vCenter, V0), N);
+
+    // If the center of the sphere is farther from the plane of the triangle than
+    // the radius of the sphere, then there cannot be an intersection.
+    XMVECTOR NoIntersection = XMVectorLess(Dist, XMVectorNegate(vRadius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Dist, vRadius));
+
+    // Project the center of the sphere onto the plane of the triangle.
+    XMVECTOR Point = XMVectorNegativeMultiplySubtract(N, Dist, vCenter);
+
+    // Is it inside all the edges? If so we intersect because the distance
+    // to the plane is less than the radius.
+    XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle(Point, V0, V1, V2);
+
+    // Find the nearest point on each edge.
+    XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius);
+
+    // Edge 0,1
+    Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V0, V1, vCenter);
+
+    // If the distance from the center of the sphere to the point is less than
+    // the radius of the sphere then it must intersect.
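+    // (Comparing squared lengths against RadiusSq keeps each edge test free of
+    // a square root; only the ordering of the two quantities matters here.)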
+    Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq));
+
+    // Edge 1,2
+    Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V1, V2, vCenter);
+
+    // If the distance from the center of the sphere to the point is less than
+    // the radius of the sphere then it must intersect.
+    Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq));
+
+    // Edge 2,0
+    Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V2, V0, vCenter);
+
+    // If the distance from the center of the sphere to the point is less than
+    // the radius of the sphere then it must intersect.
+    Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq));
+
+    return XMVector4EqualInt(XMVectorAndCInt(Intersection, NoIntersection), XMVectorTrueInt());
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere-plane intersection
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR Plane) const noexcept
+{
+    assert(DirectX::Internal::XMPlaneIsUnit(Plane));
+
+    // Load the sphere.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane, Outside, Inside);
+
+    // If the sphere is outside any plane it is outside.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return FRONT;
+
+    // If the sphere is inside all planes it is inside.
+    if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+        return BACK;
+
+    // The sphere is not inside all planes or outside a plane; it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with a sphere.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept
+{
+    assert(DirectX::Internal::XMVector3IsUnit(Direction));
+
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
+
+    // l is the vector from the ray origin to the center of the sphere.
+    XMVECTOR l = XMVectorSubtract(vCenter, Origin);
+
+    // s is the projection of l onto the ray direction.
+    XMVECTOR s = XMVector3Dot(l, Direction);
+
+    XMVECTOR l2 = XMVector3Dot(l, l);
+
+    XMVECTOR r2 = XMVectorMultiply(vRadius, vRadius);
+
+    // m2 is the squared distance from the center of the sphere to the projection.
+    XMVECTOR m2 = XMVectorNegativeMultiplySubtract(s, s, l2);
+
+    XMVECTOR NoIntersection;
+
+    // If the ray origin is outside the sphere and the center of the sphere is
+    // behind the ray origin there is no intersection.
+    NoIntersection = XMVectorAndInt(XMVectorLess(s, XMVectorZero()), XMVectorGreater(l2, r2));
+
+    // If the squared distance from the center of the sphere to the projection
+    // is greater than the radius squared the ray will miss the sphere.
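+    // Geometrically, with l = C - O, s = l . D and m2 = l . l - s*s, the hits
+    // lie at t = s -/+ sqrt(r2 - m2), so m2 > r2 leaves no real solution.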
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(m2, r2));
+
+    // The ray hits the sphere, compute the nearest intersection point.
+    XMVECTOR q = XMVectorSqrt(XMVectorSubtract(r2, m2));
+    XMVECTOR t1 = XMVectorSubtract(s, q);
+    XMVECTOR t2 = XMVectorAdd(s, q);
+
+    XMVECTOR OriginInside = XMVectorLessOrEqual(l2, r2);
+    XMVECTOR t = XMVectorSelect(t1, t2, OriginInside);
+
+    if (XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt()))
+    {
+        // Store the x-component to Dist.
+        XMStoreFloat(&Dist, t);
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a sphere vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy(
+    FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+    GXMVECTOR Plane3,
+    HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept
+{
+    // Load the sphere.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane0, Outside, Inside);
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane1, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane2, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane3, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane4, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane5, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    // If the sphere is outside any plane it is outside.
+    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt()))
+        return DISJOINT;
+
+    // If the sphere is inside all planes it is inside.
+    if (XMVector4EqualInt(AllInside, XMVectorTrueInt()))
+        return CONTAINS;
+
+    // The sphere is not inside all planes or outside a plane, it may intersect.
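+    // Note: callers typically obtain the six planes from
+    // BoundingFrustum::GetPlanes (near, far, right, left, top, bottom)
+    // before invoking this test.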
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Creates a bounding sphere that contains two other bounding spheres
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateMerged(BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2) noexcept
+{
+    XMVECTOR Center1 = XMLoadFloat3(&S1.Center);
+    float r1 = S1.Radius;
+
+    XMVECTOR Center2 = XMLoadFloat3(&S2.Center);
+    float r2 = S2.Radius;
+
+    XMVECTOR V = XMVectorSubtract(Center2, Center1);
+
+    XMVECTOR Dist = XMVector3Length(V);
+
+    float d = XMVectorGetX(Dist);
+
+    if (r1 + r2 >= d)
+    {
+        if (r1 - r2 >= d)
+        {
+            Out = S1;
+            return;
+        }
+        else if (r2 - r1 >= d)
+        {
+            Out = S2;
+            return;
+        }
+    }
+
+    XMVECTOR N = XMVectorDivide(V, Dist);
+
+    float t1 = XMMin(-r1, d - r2);
+    float t2 = XMMax(r1, d + r2);
+    float t_5 = (t2 - t1) * 0.5f;
+
+    XMVECTOR NCenter = XMVectorAdd(Center1, XMVectorMultiply(N, XMVectorReplicate(t_5 + t1)));
+
+    XMStoreFloat3(&Out.Center, NCenter);
+    Out.Radius = t_5;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere circumscribing a bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox(BoundingSphere& Out, const BoundingBox& box) noexcept
+{
+    Out.Center = box.Center;
+    XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
+    Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
+}
+
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox(BoundingSphere& Out, const BoundingOrientedBox& box) noexcept
+{
+    // Bounding box orientation is irrelevant because a sphere is rotationally invariant
+    Out.Center = box.Center;
+    XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
+    Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate smallest enclosing bounding sphere for a set of
+// points. Exact computation of the smallest enclosing bounding sphere is
+// possible but is slower and requires a more complex algorithm.
+// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
+// Graphics Gems.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromPoints(BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+    assert(Count > 0);
+    assert(pPoints);
+
+    // Find the points with minimum and maximum x, y, and z
+    XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
+
+    MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3(pPoints);
+
+    for (size_t i = 1; i < Count; ++i)
+    {
+        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+        float px = XMVectorGetX(Point);
+        float py = XMVectorGetY(Point);
+        float pz = XMVectorGetZ(Point);
+
+        if (px < XMVectorGetX(MinX))
+            MinX = Point;
+
+        if (px > XMVectorGetX(MaxX))
+            MaxX = Point;
+
+        if (py < XMVectorGetY(MinY))
+            MinY = Point;
+
+        if (py > XMVectorGetY(MaxY))
+            MaxY = Point;
+
+        if (pz < XMVectorGetZ(MinZ))
+            MinZ = Point;
+
+        if (pz > XMVectorGetZ(MaxZ))
+            MaxZ = Point;
+    }
+
+    // Use the min/max pair that are farthest apart to form the initial sphere.
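+    // Ritter's method is two linear passes: seed a sphere from the most widely
+    // separated pair of axis extremes, then grow it over any point left outside.
+    // The result may be slightly larger than optimal but always encloses the set.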
+    XMVECTOR DeltaX = XMVectorSubtract(MaxX, MinX);
+    XMVECTOR DistX = XMVector3Length(DeltaX);
+
+    XMVECTOR DeltaY = XMVectorSubtract(MaxY, MinY);
+    XMVECTOR DistY = XMVector3Length(DeltaY);
+
+    XMVECTOR DeltaZ = XMVectorSubtract(MaxZ, MinZ);
+    XMVECTOR DistZ = XMVector3Length(DeltaZ);
+
+    XMVECTOR vCenter;
+    XMVECTOR vRadius;
+
+    if (XMVector3Greater(DistX, DistY))
+    {
+        if (XMVector3Greater(DistX, DistZ))
+        {
+            // Use min/max x.
+            vCenter = XMVectorLerp(MaxX, MinX, 0.5f);
+            vRadius = XMVectorScale(DistX, 0.5f);
+        }
+        else
+        {
+            // Use min/max z.
+            vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
+            vRadius = XMVectorScale(DistZ, 0.5f);
+        }
+    }
+    else // Y >= X
+    {
+        if (XMVector3Greater(DistY, DistZ))
+        {
+            // Use min/max y.
+            vCenter = XMVectorLerp(MaxY, MinY, 0.5f);
+            vRadius = XMVectorScale(DistY, 0.5f);
+        }
+        else
+        {
+            // Use min/max z.
+            vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
+            vRadius = XMVectorScale(DistZ, 0.5f);
+        }
+    }
+
+    // Add any points not inside the sphere.
+    for (size_t i = 0; i < Count; ++i)
+    {
+        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+        XMVECTOR Delta = XMVectorSubtract(Point, vCenter);
+
+        XMVECTOR Dist = XMVector3Length(Delta);
+
+        if (XMVector3Greater(Dist, vRadius))
+        {
+            // Adjust sphere to include the new point.
+            vRadius = XMVectorScale(XMVectorAdd(vRadius, Dist), 0.5f);
+            vCenter = XMVectorAdd(vCenter, XMVectorMultiply(XMVectorSubtract(XMVectorReplicate(1.0f), XMVectorDivide(vRadius, Dist)), Delta));
+        }
+    }
+
+    XMStoreFloat3(&Out.Center, vCenter);
+    XMStoreFloat(&Out.Radius, vRadius);
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere containing frustum
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromFrustum(BoundingSphere& Out, const BoundingFrustum& fr) noexcept
+{
+    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+    fr.GetCorners(Corners);
+    CreateFromPoints(Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3));
+}
+
+
+/****************************************************************************
+ *
+ * BoundingBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an axis aligned box by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform(BoundingBox& Out, FXMMATRIX M) const noexcept
+{
+    // Load center and extents.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+    // Compute and transform the corners and find new min/max bounds.
+    XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter);
+    Corner = XMVector3Transform(Corner, M);
+
+    XMVECTOR Min, Max;
+    Min = Max = Corner;
+
+    for (size_t i = 1; i < CORNER_COUNT; ++i)
+    {
+        Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter);
+        Corner = XMVector3Transform(Corner, M);
+
+        Min = XMVectorMin(Min, Corner);
+        Max = XMVectorMax(Max, Corner);
+    }
+
+    // Store center and extents.
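+    // (The result is the AABB of the eight transformed corners; under rotation
+    // this is conservative, so repeatedly transforming a box in place can grow it.)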
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingBox::Transform(BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + assert(DirectX::Internal::XMQuaternionIsUnit(Rotation)); + + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR VectorScale = XMVectorReplicate(Scale); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter); + Corner = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation), Translation); + + XMVECTOR Min, Max; + Min = Max = Corner; + + for (size_t i = 1; i < CORNER_COUNT; ++i) + { + Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter); + Corner = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation), Translation); + + Min = XMVectorMin(Min, Corner); + Max = XMVectorMax(Max, Corner); + } + + // Store center and extents. + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + return XMVector3InBounds(XMVectorSubtract(Point, vCenter), vExtents) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + if (!Intersects(V0, V1, V2)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR d = XMVectorAbs(XMVectorSubtract(V0, vCenter)); + XMVECTOR Inside = XMVectorLessOrEqual(d, vExtents); + + d = XMVectorAbs(XMVectorSubtract(V1, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + + d = XMVectorAbs(XMVectorSubtract(V2, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents); + XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax); + + XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + if (XMVector3Greater(d2, XMVectorMultiply(SphereRadius, SphereRadius))) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual(XMVectorAdd(BoxMin, SphereRadius), SphereCenter); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(SphereCenter, XMVectorSubtract(BoxMax, SphereRadius))); + InsideAll = XMVectorAndInt(InsideAll, XMVectorGreater(XMVectorSubtract(BoxMax, BoxMin), SphereRadius)); + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingBox& box) const noexcept +{ + XMVECTOR CenterA = XMLoadFloat3(&Center); + XMVECTOR ExtentsA = XMLoadFloat3(&Extents); + + XMVECTOR CenterB = XMLoadFloat3(&box.Center); + XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents); + + XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA); + XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA); + + XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB); + XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA)); + + if (DirectX::Internal::XMVector3AnyTrue(Disjoint)) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B + XMVECTOR Inside = XMVectorAndInt(XMVectorLessOrEqual(MinA, MinB), XMVectorLessOrEqual(MaxB, MaxA)); + + return DirectX::Internal::XMVector3AllTrue(Inside) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Subtract off the AABB center to remove a subtract below + XMVECTOR oCenter = XMVectorSubtract(XMLoadFloat3(&box.Center), vCenter); + + XMVECTOR oExtents = XMLoadFloat3(&box.Extents); + XMVECTOR oOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(oOrientation)); + + XMVECTOR Inside = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(oExtents, g_BoxOffset[i]), oOrientation), oCenter); + XMVECTOR d = XMVectorAbs(C); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + } + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Frustum in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners(Corners); + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR Inside = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR Point = XMLoadFloat3(&Corners[i]); + XMVECTOR d = XMVectorAbs(XMVectorSubtract(Point, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + } + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents); + XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax); + + XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax); + + // Choose value for each dimension based on the comparison. 
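+    // d ends up holding the per-axis offset from the sphere center to the
+    // nearest point of the box, and is zero on any axis where the center
+    // already lies inside that slab.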
+ d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + return XMVector3LessOrEqual(d2, XMVectorMultiply(SphereRadius, SphereRadius)); +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingBox& box) const noexcept +{ + XMVECTOR CenterA = XMLoadFloat3(&Center); + XMVECTOR ExtentsA = XMLoadFloat3(&Extents); + + XMVECTOR CenterB = XMLoadFloat3(&box.Center); + XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents); + + XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA); + XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA); + + XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB); + XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA)); + + return !DirectX::Internal::XMVector3AnyTrue(Disjoint); +} + + +//----------------------------------------------------------------------------- +// Oriented box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingOrientedBox& box) const noexcept +{ + return box.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingFrustum& fr) const noexcept +{ + return fr.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. axis aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + XMVECTOR Zero = XMVectorZero(); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(vCenter, vExtents); + XMVECTOR BoxMax = XMVectorAdd(vCenter, vExtents); + + // Test the axes of the box (in effect test the AAB against the minimal AAB + // around the triangle). + XMVECTOR TriMin = XMVectorMin(XMVectorMin(V0, V1), V2); + XMVECTOR TriMax = XMVectorMax(XMVectorMax(V0, V1), V2); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(TriMin, BoxMax), XMVectorGreater(BoxMin, TriMax)); + if (DirectX::Internal::XMVector3AnyTrue(Disjoint)) + return false; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)); + XMVECTOR Dist = XMVector3Dot(Normal, V0); + + // Assert that the triangle is not degenerate. 
+    assert(!XMVector3Equal(Normal, Zero));
+
+    // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
+    // else v_min(i)=b_max(i), v_max(i)=b_min(i)
+    XMVECTOR NormalSelect = XMVectorGreater(Normal, Zero);
+    XMVECTOR V_Min = XMVectorSelect(BoxMax, BoxMin, NormalSelect);
+    XMVECTOR V_Max = XMVectorSelect(BoxMin, BoxMax, NormalSelect);
+
+    // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
+    XMVECTOR MinDist = XMVector3Dot(V_Min, Normal);
+    XMVECTOR MaxDist = XMVector3Dot(V_Max, Normal);
+
+    XMVECTOR NoIntersection = XMVectorGreater(MinDist, Dist);
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(MaxDist, Dist));
+
+    // Move the box center to zero to simplify the following tests.
+    XMVECTOR TV0 = XMVectorSubtract(V0, vCenter);
+    XMVECTOR TV1 = XMVectorSubtract(V1, vCenter);
+    XMVECTOR TV2 = XMVectorSubtract(V2, vCenter);
+
+    // Test the edge/edge axes (3*3).
+    XMVECTOR e0 = XMVectorSubtract(TV1, TV0);
+    XMVECTOR e1 = XMVectorSubtract(TV2, TV1);
+    XMVECTOR e2 = XMVectorSubtract(TV0, TV2);
+
+    // Make w zero.
+    e0 = XMVectorInsert<0, 0, 0, 0, 1>(e0, Zero);
+    e1 = XMVectorInsert<0, 0, 0, 0, 1>(e1, Zero);
+    e2 = XMVectorInsert<0, 0, 0, 0, 1>(e2, Zero);
+
+    XMVECTOR Axis;
+    XMVECTOR p0, p1, p2;
+    XMVECTOR Min, Max;
+    XMVECTOR Radius;
+
+    // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
+    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e0, XMVectorNegate(e0));
+    p0 = XMVector3Dot(TV0, Axis);
+    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+    p2 = XMVector3Dot(TV2, Axis);
+    Min = XMVectorMin(p0, p2);
+    Max = XMVectorMax(p0, p2);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
+    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e1, XMVectorNegate(e1));
+    p0 = XMVector3Dot(TV0, Axis);
+    p1 = XMVector3Dot(TV1, Axis);
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+    Min = XMVectorMin(p0, p1);
+    Max = XMVectorMax(p0, p1);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
+    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e2, XMVectorNegate(e2));
+    p0 = XMVector3Dot(TV0, Axis);
+    p1 = XMVector3Dot(TV1, Axis);
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+    Min = XMVectorMin(p0, p1);
+    Max = XMVectorMax(p0, p1);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
+    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e0, XMVectorNegate(e0));
+    p0 = XMVector3Dot(TV0, Axis);
+    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+    p2 = XMVector3Dot(TV2, Axis);
+    Min = XMVectorMin(p0, p2);
+    Max = XMVectorMax(p0, p2);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
+    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e1, XMVectorNegate(e1));
+    p0 = XMVector3Dot(TV0, Axis);
+    p1 = XMVector3Dot(TV1, Axis);
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+    Min = XMVectorMin(p0, p1);
+    Max = XMVectorMax(p0, p1);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
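+    // Each edge/edge block follows the same pattern: project the triangle
+    // (p0..p2) and the box half-extents (Radius) onto the candidate axis and
+    // fold any separation into NoIntersection. Together with the box faces and
+    // the triangle plane tested above, this covers all 13 candidate axes.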
+
+    // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
+    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e2, XMVectorNegate(e2));
+    p0 = XMVector3Dot(TV0, Axis);
+    p1 = XMVector3Dot(TV1, Axis);
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+    Min = XMVectorMin(p0, p1);
+    Max = XMVectorMax(p0, p1);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
+    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e0, XMVectorNegate(e0));
+    p0 = XMVector3Dot(TV0, Axis);
+    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+    p2 = XMVector3Dot(TV2, Axis);
+    Min = XMVectorMin(p0, p2);
+    Max = XMVectorMax(p0, p2);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
+    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e1, XMVectorNegate(e1));
+    p0 = XMVector3Dot(TV0, Axis);
+    p1 = XMVector3Dot(TV1, Axis);
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+    Min = XMVectorMin(p0, p1);
+    Max = XMVectorMax(p0, p1);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
+    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e2, XMVectorNegate(e2));
+    p0 = XMVector3Dot(TV0, Axis);
+    p1 = XMVector3Dot(TV1, Axis);
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+    Min = XMVectorMin(p0, p1);
+    Max = XMVectorMax(p0, p1);
+    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+    return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt());
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects(FXMVECTOR Plane) const noexcept
+{
+    assert(DirectX::Internal::XMPlaneIsUnit(Plane));
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane, Outside, Inside);
+
+    // If the box is outside any plane it is outside.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return FRONT;
+
+    // If the box is inside all planes it is inside.
+    if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+        return BACK;
+
+    // The box is not inside all planes or outside a plane; it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an axis aligned
+// box using the slabs method.
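+// Per axis i the slab yields t1 = (c(i) - e(i) - o(i)) / dir(i) and
+// t2 = (c(i) + e(i) - o(i)) / dir(i); the ray hits the box iff the largest
+// min(t1,t2) does not exceed the smallest max(t1,t2) across the three axes.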
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingBox::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept
+{
+    assert(DirectX::Internal::XMVector3IsUnit(Direction));
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+    // Adjust ray origin to be relative to center of the box.
+    XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin);
+
+    // Compute the dot product against each axis of the box.
+    // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
+    XMVECTOR AxisDotOrigin = TOrigin;
+    XMVECTOR AxisDotDirection = Direction;
+
+    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+    XMVECTOR IsParallel = XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon);
+
+    // Test against all three axes simultaneously.
+    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection);
+    XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents), InverseAxisDotDirection);
+    XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents), InverseAxisDotDirection);
+
+    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
+    // use the results from any directions parallel to the slab.
+    XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel);
+    XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel);
+
+    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+    t_min = XMVectorMax(t_min, XMVectorSplatY(t_min));  // x = max(x,y)
+    t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min));  // x = max(max(x,y),z)
+    t_max = XMVectorMin(t_max, XMVectorSplatY(t_max));  // x = min(x,y)
+    t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max));  // x = min(min(x,y),z)
+
+    // if ( t_min > t_max ) return false;
+    XMVECTOR NoIntersection = XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max));
+
+    // if ( t_max < 0.0f ) return false;
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero()));
+
+    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+    XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents);
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap));
+
+    if (!DirectX::Internal::XMVector3AnyTrue(NoIntersection))
+    {
+        // Store the x-component to Dist.
+        XMStoreFloat(&Dist, t_min);
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an axis aligned box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy(
+    FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+    GXMVECTOR Plane3,
+    HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
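+    // All six tests below run unconditionally, accumulating AnyOutside and
+    // AllInside with vector OR/AND; a branchless layout that trades early-outs
+    // for predictable SIMD control flow.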
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the box is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the box is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains two other bounding boxes +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateMerged(BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2) noexcept +{ + XMVECTOR b1Center = XMLoadFloat3(&b1.Center); + XMVECTOR b1Extents = XMLoadFloat3(&b1.Extents); + + XMVECTOR b2Center = XMLoadFloat3(&b2.Center); + XMVECTOR b2Extents = XMLoadFloat3(&b2.Extents); + + XMVECTOR Min = XMVectorSubtract(b1Center, b1Extents); + Min = XMVectorMin(Min, XMVectorSubtract(b2Center, b2Extents)); + + XMVECTOR Max = XMVectorAdd(b1Center, b1Extents); + Max = XMVectorMax(Max, XMVectorAdd(b2Center, b2Extents)); + + assert(XMVector3LessOrEqual(Min, Max)); + + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains a bounding sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromSphere(BoundingBox& Out, const BoundingSphere& sh) noexcept +{ + XMVECTOR spCenter = XMLoadFloat3(&sh.Center); + XMVECTOR shRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR Min = XMVectorSubtract(spCenter, shRadius); + XMVECTOR Max = XMVectorAdd(spCenter, shRadius); + + assert(XMVector3LessOrEqual(Min, Max)); + + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box from min/max points +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void 
XM_CALLCONV BoundingBox::CreateFromPoints(BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2) noexcept
+{
+    XMVECTOR Min = XMVectorMin(pt1, pt2);
+    XMVECTOR Max = XMVectorMax(pt1, pt2);
+
+    // Store center and extents.
+    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
+    XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the minimum axis aligned bounding box containing a set of points.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints(BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+    assert(Count > 0);
+    assert(pPoints);
+
+    // Find the minimum and maximum x, y, and z
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMLoadFloat3(pPoints);
+
+    for (size_t i = 1; i < Count; ++i)
+    {
+        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+        vMin = XMVectorMin(vMin, Point);
+        vMax = XMVectorMax(vMax, Point);
+    }
+
+    // Store center and extents.
+    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f));
+    XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f));
+}
+
+
+/****************************************************************************
+ *
+ * BoundingOrientedBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an oriented box by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform(BoundingOrientedBox& Out, FXMMATRIX M) const noexcept
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+    // Composite the box rotation and the transform rotation.
+    XMMATRIX nM;
+    nM.r[0] = XMVector3Normalize(M.r[0]);
+    nM.r[1] = XMVector3Normalize(M.r[1]);
+    nM.r[2] = XMVector3Normalize(M.r[2]);
+    nM.r[3] = g_XMIdentityR3;
+    XMVECTOR Rotation = XMQuaternionRotationMatrix(nM);
+    vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
+
+    // Transform the center.
+    vCenter = XMVector3Transform(vCenter, M);
+
+    // Scale the box extents.
+    XMVECTOR dX = XMVector3Length(M.r[0]);
+    XMVECTOR dY = XMVector3Length(M.r[1]);
+    XMVECTOR dZ = XMVector3Length(M.r[2]);
+
+    XMVECTOR VectorScale = XMVectorSelect(dY, dX, g_XMSelect1000);
+    VectorScale = XMVectorSelect(dZ, VectorScale, g_XMSelect1100);
+    vExtents = XMVectorMultiply(vExtents, VectorScale);
+
+    // Store the box.
+    XMStoreFloat3(&Out.Center, vCenter);
+    XMStoreFloat3(&Out.Extents, vExtents);
+    XMStoreFloat4(&Out.Orientation, vOrientation);
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform(BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept
+{
+    assert(DirectX::Internal::XMQuaternionIsUnit(Rotation));
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+    // Composite the box rotation and the transform rotation.
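+    // XMQuaternionMultiply(Q1, Q2) concatenates the rotation Q1 followed by Q2,
+    // so the box's own orientation is applied first and the transform's second.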
+ vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the center. + XMVECTOR VectorScale = XMVectorReplicate(Scale); + vCenter = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(vCenter, VectorScale), Rotation), Translation); + + // Scale the box extents. + vExtents = XMVectorMultiply(vExtents, VectorScale); + + // Store the box. + XMStoreFloat3(&Out.Center, vCenter); + XMStoreFloat3(&Out.Extents, vExtents); + XMStoreFloat4(&Out.Orientation, vOrientation); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(vExtents, g_BoxOffset[i]), vOrientation), vCenter); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Transform the point to be local to the box. + XMVECTOR TPoint = XMVector3InverseRotate(XMVectorSubtract(Point, vCenter), vOrientation); + + return XMVector3InBounds(TPoint, vExtents) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation); + XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation); + XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation); + + BoundingBox box; + box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. 
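+    // Rotating the vertices into the box's local frame reduces the oriented case
+    // to an origin-centered AABB test, so no OBB-specific triangle code is needed.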
+ return box.Contains(TV0, TV1, TV2); +} + + +//----------------------------------------------------------------------------- +// Sphere in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate(XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents)); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents); + + XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + XMVECTOR SphereRadiusSq = XMVectorMultiply(SphereRadius, SphereRadius); + + if (XMVector4Greater(d2, SphereRadiusSq)) + return DISJOINT; + + // See if we are completely inside the box + XMVECTOR SMin = XMVectorSubtract(SphereCenter, SphereRadius); + XMVECTOR SMax = XMVectorAdd(SphereCenter, SphereRadius); + + return (XMVector3InBounds(SMin, BoxExtents) && XMVector3InBounds(SMax, BoxExtents)) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. 
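+    // XMFLOAT4(0.f, 0.f, 0.f, 1.f) is the identity quaternion, i.e. an unrotated box.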
+ BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Contains(obox); +} + + +//----------------------------------------------------------------------------- +// Oriented bounding box in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!Intersects(box)) + return DISJOINT; + + // Load the boxes + XMVECTOR aCenter = XMLoadFloat3(&Center); + XMVECTOR aExtents = XMLoadFloat3(&Extents); + XMVECTOR aOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(aOrientation)); + + XMVECTOR bCenter = XMLoadFloat3(&box.Center); + XMVECTOR bExtents = XMLoadFloat3(&box.Extents); + XMVECTOR bOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(bOrientation)); + + XMVECTOR offset = XMVectorSubtract(bCenter, aCenter); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter + // Ca = invrotate( Cb - aCenter, aOrientation ) + + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(bExtents, g_BoxOffset[i]), bOrientation), offset); + C = XMVector3InverseRotate(C, aOrientation); + + if (!XMVector3InBounds(C, aExtents)) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Frustum in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners(Corners); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVector3InverseRotate(XMVectorSubtract(XMLoadFloat3(&Corners[i]), vCenter), vOrientation); + + if (!XMVector3InBounds(C, vExtents)) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate(XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation); + + // Find the distance to the nearest point on the box. 
+ // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents)); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents); + + XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + return XMVector4LessOrEqual(d2, XMVectorMultiply(SphereRadius, SphereRadius)) ? true : false; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. + BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Intersects(obox); +} + + +//----------------------------------------------------------------------------- +// Fast oriented box / oriented box intersection test using the separating axis +// theorem. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingOrientedBox& box) const noexcept +{ + // Build the 3x3 rotation matrix that defines the orientation of B relative to A. + XMVECTOR A_quat = XMLoadFloat4(&Orientation); + XMVECTOR B_quat = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(A_quat)); + assert(DirectX::Internal::XMQuaternionIsUnit(B_quat)); + + XMVECTOR Q = XMQuaternionMultiply(A_quat, XMQuaternionConjugate(B_quat)); + XMMATRIX R = XMMatrixRotationQuaternion(Q); + + // Compute the translation of B relative to A. + XMVECTOR A_cent = XMLoadFloat3(&Center); + XMVECTOR B_cent = XMLoadFloat3(&box.Center); + XMVECTOR t = XMVector3InverseRotate(XMVectorSubtract(B_cent, A_cent), A_quat); + + // + // h(A) = extents of A. + // h(B) = extents of B. + // + // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1) + // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22) + // + // For each possible separating axis l: + // d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l ) + // d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l ) + // if abs( t dot l ) > d(A) + d(B) then disjoint + // + + // Load extents of A and B. + XMVECTOR h_A = XMLoadFloat3(&Extents); + XMVECTOR h_B = XMLoadFloat3(&box.Extents); + + // Rows. Note R[0,1,2]X.w = 0. + XMVECTOR R0X = R.r[0]; + XMVECTOR R1X = R.r[1]; + XMVECTOR R2X = R.r[2]; + + R = XMMatrixTranspose(R); + + // Columns. Note RX[0,1,2].w = 0. + XMVECTOR RX0 = R.r[0]; + XMVECTOR RX1 = R.r[1]; + XMVECTOR RX2 = R.r[2]; + + // Absolute value of rows. + XMVECTOR AR0X = XMVectorAbs(R0X); + XMVECTOR AR1X = XMVectorAbs(R1X); + XMVECTOR AR2X = XMVectorAbs(R2X); + + // Absolute value of columns. 
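+    // (Component-wise absolute values let the half-extent projections d(A) and
+    // d(B) below be computed as plain dot products against each candidate axis.)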
+    XMVECTOR ARX0 = XMVectorAbs(RX0);
+    XMVECTOR ARX1 = XMVectorAbs(RX1);
+    XMVECTOR ARX2 = XMVectorAbs(RX2);
+
+    // Test each of the 15 possible separating axes.
+    XMVECTOR d, d_A, d_B;
+
+    // l = a(u) = (1, 0, 0)
+    // t dot l = t.x
+    // d(A) = h(A).x
+    // d(B) = h(B) dot abs(r00, r01, r02)
+    d = XMVectorSplatX(t);
+    d_A = XMVectorSplatX(h_A);
+    d_B = XMVector3Dot(h_B, AR0X);
+    XMVECTOR NoIntersection = XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B));
+
+    // l = a(v) = (0, 1, 0)
+    // t dot l = t.y
+    // d(A) = h(A).y
+    // d(B) = h(B) dot abs(r10, r11, r12)
+    d = XMVectorSplatY(t);
+    d_A = XMVectorSplatY(h_A);
+    d_B = XMVector3Dot(h_B, AR1X);
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(w) = (0, 0, 1)
+    // t dot l = t.z
+    // d(A) = h(A).z
+    // d(B) = h(B) dot abs(r20, r21, r22)
+    d = XMVectorSplatZ(t);
+    d_A = XMVectorSplatZ(h_A);
+    d_B = XMVector3Dot(h_B, AR2X);
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = b(u) = (r00, r10, r20)
+    // d(A) = h(A) dot abs(r00, r10, r20)
+    // d(B) = h(B).x
+    d = XMVector3Dot(t, RX0);
+    d_A = XMVector3Dot(h_A, ARX0);
+    d_B = XMVectorSplatX(h_B);
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = b(v) = (r01, r11, r21)
+    // d(A) = h(A) dot abs(r01, r11, r21)
+    // d(B) = h(B).y
+    d = XMVector3Dot(t, RX1);
+    d_A = XMVector3Dot(h_A, ARX1);
+    d_B = XMVectorSplatY(h_B);
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = b(w) = (r02, r12, r22)
+    // d(A) = h(A) dot abs(r02, r12, r22)
+    // d(B) = h(B).z
+    d = XMVector3Dot(t, RX2);
+    d_A = XMVector3Dot(h_A, ARX2);
+    d_B = XMVectorSplatZ(h_B);
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(u) x b(u) = (0, -r20, r10)
+    // d(A) = h(A) dot abs(0, r20, r10)
+    // d(B) = h(B) dot abs(0, r02, r01)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX0, XMVectorNegate(RX0)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX0));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR0X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(u) x b(v) = (0, -r21, r11)
+    // d(A) = h(A) dot abs(0, r21, r11)
+    // d(B) = h(B) dot abs(r02, 0, r00)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX1, XMVectorNegate(RX1)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX1));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR0X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(u) x b(w) = (0, -r22, r12)
+    // d(A) = h(A) dot abs(0, r22, r12)
+    // d(B) = h(B) dot abs(r01, r00, 0)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX2, XMVectorNegate(RX2)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX2));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR0X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(v) x b(u) = (r20, 0, -r00)
+    // d(A) = h(A) dot abs(r20, 0, r00)
+    // d(B) = h(B) dot abs(0, r12, r11)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX0, XMVectorNegate(RX0)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX0));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR1X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(v) x b(v) = (r21, 0, -r01)
+    // d(A) = h(A) dot abs(r21, 0, r01)
+    // d(B) = h(B) dot abs(r12, 0, r10)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX1, XMVectorNegate(RX1)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX1));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR1X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(v) x b(w) = (r22, 0, -r02)
+    // d(A) = h(A) dot abs(r22, 0, r02)
+    // d(B) = h(B) dot abs(r11, r10, 0)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX2, XMVectorNegate(RX2)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX2));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR1X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(w) x b(u) = (-r10, r00, 0)
+    // d(A) = h(A) dot abs(r10, r00, 0)
+    // d(B) = h(B) dot abs(0, r22, r21)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX0, XMVectorNegate(RX0)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX0));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR2X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(w) x b(v) = (-r11, r01, 0)
+    // d(A) = h(A) dot abs(r11, r01, 0)
+    // d(B) = h(B) dot abs(r22, 0, r20)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX1, XMVectorNegate(RX1)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX1));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR2X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // l = a(w) x b(w) = (-r12, r02, 0)
+    // d(A) = h(A) dot abs(r12, r02, 0)
+    // d(B) = h(B) dot abs(r21, r20, 0)
+    d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX2, XMVectorNegate(RX2)));
+    d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX2));
+    d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR2X));
+    NoIntersection = XMVectorOrInt(NoIntersection,
+        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+    // No separating axis found; the boxes must intersect.
+    return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt()) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. oriented box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects(const BoundingFrustum& fr) const noexcept
+{
+    return fr.Intersects(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept
+{
+    // Load the box center & orientation.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    // Transform the triangle vertices into the space of the box.
+    XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation);
+    XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation);
+    XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation);
+
+    BoundingBox box;
+    box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f);
+    box.Extents = Extents;
+
+    // Use the triangle vs axis aligned box intersection routine.
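+    // (Illustrative usage sketch, not part of the library; the variable names
+    // below are hypothetical:
+    //     BoundingOrientedBox obb(XMFLOAT3(0, 0, 0), XMFLOAT3(1, 1, 1),
+    //                             XMFLOAT4(0, 0, 0, 1));
+    //     bool hit = obb.Intersects(XMVectorSet(0, 0, 0, 0),
+    //                               XMVectorSet(2, 0, 0, 0),
+    //                               XMVectorSet(0, 2, 0, 0));   // hit == true
+    // Because the vertices are rotated into the box's local frame first, the
+    // triangle/AABB routine below sees a purely axis-aligned problem.)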
+    return box.Intersects(TV0, TV1, TV2);
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR Plane) const noexcept
+{
+    assert(DirectX::Internal::XMPlaneIsUnit(Plane));
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+    XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation));
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation);
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside);
+
+    // If the box is fully in front of the plane it is in front.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return FRONT;
+
+    // If the box is fully behind the plane it is behind.
+    if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+        return BACK;
+
+    // The box is neither fully in front of nor fully behind the plane; it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an oriented box
+// using the slabs method.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept
+{
+    assert(DirectX::Internal::XMVector3IsUnit(Direction));
+
+    static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } };
+    static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+    // Get the box's normalized side directions.
+    XMMATRIX R = XMMatrixRotationQuaternion(vOrientation);
+
+    // Adjust ray origin to be relative to center of the box.
+    XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin);
+
+    // Compute the dot product against each axis of the box.
+    XMVECTOR AxisDotOrigin = XMVector3Dot(R.r[0], TOrigin);
+    AxisDotOrigin = XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[1], TOrigin), SelectY);
+    AxisDotOrigin = XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[2], TOrigin), SelectZ);
+
+    XMVECTOR AxisDotDirection = XMVector3Dot(R.r[0], Direction);
+    AxisDotDirection = XMVectorSelect(AxisDotDirection, XMVector3Dot(R.r[1], Direction), SelectY);
+    AxisDotDirection = XMVectorSelect(AxisDotDirection, XMVector3Dot(R.r[2], Direction), SelectZ);
+
+    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+    XMVECTOR IsParallel = XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon);
+
+    // Test against all three axes simultaneously.
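+    // (Illustrative note, not part of the library: per slab, the entry/exit
+    // parameters are t1 = (c - e) / dd and t2 = (c + e) / dd, where c is the
+    // box center projected onto the axis relative to the ray origin, e the
+    // extent, and dd the direction's projection. For example, a ray starting
+    // 5 units in front of a unit slab and heading straight at it gives
+    // t1 = 4 and t2 = 6; the ray hits the box only if the intersection of
+    // all three [min(t1,t2), max(t1,t2)] intervals is non-empty.)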
+    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection);
+    XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents), InverseAxisDotDirection);
+    XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents), InverseAxisDotDirection);
+
+    // Compute the max of min(t1,t2) and the min of max(t1,t2), ensuring we don't
+    // use the results from any directions parallel to the slab.
+    XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel);
+    XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel);
+
+    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+    t_min = XMVectorMax(t_min, XMVectorSplatY(t_min));  // x = max(x,y)
+    t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min));  // x = max(max(x,y),z)
+    t_max = XMVectorMin(t_max, XMVectorSplatY(t_max));  // x = min(x,y)
+    t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max));  // x = min(min(x,y),z)
+
+    // if ( t_min > t_max ) return false;
+    XMVECTOR NoIntersection = XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max));
+
+    // if ( t_max < 0.0f ) return false;
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero()));
+
+    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+    XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents);
+    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap));
+
+    if (!DirectX::Internal::XMVector3AnyTrue(NoIntersection))
+    {
+        // Store the x-component to Dist.
+        XMStoreFloat(&Dist, t_min);
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an oriented box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy(
+    FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+    GXMVECTOR Plane3,
+    HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3(&Center);
+    XMVECTOR vExtents = XMLoadFloat3(&Extents);
+    XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation));
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation);
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
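+    // (Illustrative note, not part of the library: each
+    // FastIntersectOrientedBoxPlane call below yields two masks. OR-ing the
+    // Outside masks and AND-ing the Inside masks across the six planes reduces
+    // the final classification to two vector compares: any-outside => DISJOINT,
+    // all-inside => CONTAINS, otherwise INTERSECTS.)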
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside);
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside);
+    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+    AllInside = XMVectorAndInt(AllInside, Inside);
+
+    // If the box is outside any plane it is outside.
+    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt()))
+        return DISJOINT;
+
+    // If the box is inside all planes it is inside.
+    if (XMVector4EqualInt(AllInside, XMVectorTrueInt()))
+        return CONTAINS;
+
+    // The box is neither inside all planes nor outside any plane; it intersects.
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create oriented bounding box from axis-aligned bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromBoundingBox(BoundingOrientedBox& Out, const BoundingBox& box) noexcept
+{
+    Out.Center = box.Center;
+    Out.Extents = box.Extents;
+    Out.Orientation = XMFLOAT4(0.f, 0.f, 0.f, 1.f);
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate minimum oriented bounding box containing a set of
+// points. Exact computation of the minimum oriented bounding box is possible,
+// but the best known algorithm is O(N^3) and is significantly more complex to
+// implement.
+// This algorithm works by computing the inertia tensor of the points and then
+// using the eigenvectors of the inertia tensor as the axes of the box.
+// Computing the inertia tensor of the convex hull of the points will usually
+// result in a better bounding box, but that computation is more complex.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromPoints(BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+    assert(Count > 0);
+    assert(pPoints != nullptr);
+
+    XMVECTOR CenterOfMass = XMVectorZero();
+
+    // Compute the center of mass and inertia tensor of the points.
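+    // (Illustrative note, not part of the library: with m = mean of the points
+    // and d = p - m, the accumulated terms form the covariance matrix
+    //     | sum(dx*dx)  sum(dx*dy)  sum(dx*dz) |
+    //     | sum(dx*dy)  sum(dy*dy)  sum(dy*dz) |
+    //     | sum(dx*dz)  sum(dy*dz)  sum(dz*dz) |
+    // XX_YY_ZZ holds the diagonal and XY_XZ_YZ the off-diagonal terms; the
+    // matrix's eigenvectors give the principal directions used as box axes.)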
+    for (size_t i = 0; i < Count; ++i)
+    {
+        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+        CenterOfMass = XMVectorAdd(CenterOfMass, Point);
+    }
+
+    CenterOfMass = XMVectorMultiply(CenterOfMass, XMVectorReciprocal(XMVectorReplicate(float(Count))));
+
+    // Compute the inertia tensor of the points around the center of mass.
+    // Using the center of mass is not strictly necessary, but will hopefully
+    // improve the stability of finding the eigenvectors.
+    XMVECTOR XX_YY_ZZ = XMVectorZero();
+    XMVECTOR XY_XZ_YZ = XMVectorZero();
+
+    for (size_t i = 0; i < Count; ++i)
+    {
+        XMVECTOR Point = XMVectorSubtract(XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)), CenterOfMass);
+
+        XX_YY_ZZ = XMVectorAdd(XX_YY_ZZ, XMVectorMultiply(Point, Point));
+
+        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Point);
+        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Point);
+
+        XY_XZ_YZ = XMVectorAdd(XY_XZ_YZ, XMVectorMultiply(XXY, YZZ));
+    }
+
+    XMVECTOR v1, v2, v3;
+
+    // Compute the eigenvectors of the inertia tensor.
+    DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix(XMVectorGetX(XX_YY_ZZ), XMVectorGetY(XX_YY_ZZ),
+        XMVectorGetZ(XX_YY_ZZ),
+        XMVectorGetX(XY_XZ_YZ), XMVectorGetY(XY_XZ_YZ),
+        XMVectorGetZ(XY_XZ_YZ),
+        &v1, &v2, &v3);
+
+    // Put them in a matrix.
+    XMMATRIX R;
+
+    R.r[0] = XMVectorSetW(v1, 0.f);
+    R.r[1] = XMVectorSetW(v2, 0.f);
+    R.r[2] = XMVectorSetW(v3, 0.f);
+    R.r[3] = g_XMIdentityR3.v;
+
+    // Multiply by -1 to convert the matrix into a right-handed coordinate
+    // system (Det ~= 1) in case the eigenvectors form a left-handed
+    // coordinate system (Det ~= -1), because XMQuaternionRotationMatrix only
+    // works on right-handed matrices.
+    XMVECTOR Det = XMMatrixDeterminant(R);
+
+    if (XMVector4Less(Det, XMVectorZero()))
+    {
+        R.r[0] = XMVectorMultiply(R.r[0], g_XMNegativeOne.v);
+        R.r[1] = XMVectorMultiply(R.r[1], g_XMNegativeOne.v);
+        R.r[2] = XMVectorMultiply(R.r[2], g_XMNegativeOne.v);
+    }
+
+    // Get the rotation quaternion from the matrix.
+    XMVECTOR vOrientation = XMQuaternionRotationMatrix(R);
+
+    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
+    vOrientation = XMQuaternionNormalize(vOrientation);
+
+    // Rebuild the rotation matrix from the quaternion.
+    R = XMMatrixRotationQuaternion(vOrientation);
+
+    // Build the rotation into the rotated space.
+    XMMATRIX InverseR = XMMatrixTranspose(R);
+
+    // Find the minimum OBB using the eigenvectors as the axes.
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMVector3TransformNormal(XMLoadFloat3(pPoints), InverseR);
+
+    for (size_t i = 1; i < Count; ++i)
+    {
+        XMVECTOR Point = XMVector3TransformNormal(XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)),
+            InverseR);
+
+        vMin = XMVectorMin(vMin, Point);
+        vMax = XMVectorMax(vMax, Point);
+    }
+
+    // Rotate the center into world space.
+    XMVECTOR vCenter = XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f);
+    vCenter = XMVector3TransformNormal(vCenter, R);
+
+    // Store center, extents, and orientation.
+ XMStoreFloat3(&Out.Center, vCenter); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f)); + XMStoreFloat4(&Out.Orientation, vOrientation); +} + + +/**************************************************************************** + * + * BoundingFrustum + * + ****************************************************************************/ + +_Use_decl_annotations_ +inline BoundingFrustum::BoundingFrustum(CXMMATRIX Projection, bool rhcoords) noexcept +{ + CreateFromMatrix(*this, Projection, rhcoords); +} + + +//----------------------------------------------------------------------------- +// Transform a frustum by an angle preserving transform. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform(BoundingFrustum& Out, FXMMATRIX M) const noexcept +{ + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Composite the frustum rotation and the transform rotation + XMMATRIX nM; + nM.r[0] = XMVector3Normalize(M.r[0]); + nM.r[1] = XMVector3Normalize(M.r[1]); + nM.r[2] = XMVector3Normalize(M.r[2]); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix(nM); + vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the center. + vOrigin = XMVector3Transform(vOrigin, M); + + // Store the frustum. + XMStoreFloat3(&Out.Origin, vOrigin); + XMStoreFloat4(&Out.Orientation, vOrientation); + + // Scale the near and far distances (the slopes remain the same). + XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]); + XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]); + XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]); + + XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ)); + float Scale = sqrtf(XMVectorGetX(d)); + + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform(BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + assert(DirectX::Internal::XMQuaternionIsUnit(Rotation)); + + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Composite the frustum rotation and the transform rotation. + vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the origin. + vOrigin = XMVectorAdd(XMVector3Rotate(XMVectorScale(vOrigin, Scale), Rotation), Translation); + + // Store the frustum. + XMStoreFloat3(&Out.Origin, vOrigin); + XMStoreFloat4(&Out.Orientation, vOrientation); + + // Scale the near and far distances (the slopes remain the same). + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load origin and orientation of the frustum. 
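+    // (Illustrative note, not part of the library: the frustum is stored as an
+    // apex (Origin), an Orientation, Near/Far distances, and four plane slopes,
+    // so a local-space corner is simply (slope_x * z, slope_y * z, z). For a
+    // symmetric frustum with 90-degree fields of view, RightSlope = +1,
+    // LeftSlope = -1, TopSlope = +1, BottomSlope = -1, and the near-plane
+    // corners sit at (+/-Near, +/-Near, Near).)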
+    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+    // Build the corners of the frustum.
+    XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
+    XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
+    XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
+    XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
+    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
+    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
+
+    // Returns the 8 corner positions of the bounding frustum.
+    //     Near     Far
+    //    0----1   4----5
+    //    |    |   |    |
+    //    |    |   |    |
+    //    3----2   7----6
+
+    XMVECTOR vCorners[CORNER_COUNT];
+    vCorners[0] = XMVectorMultiply(vLeftTop, vNear);
+    vCorners[1] = XMVectorMultiply(vRightTop, vNear);
+    vCorners[2] = XMVectorMultiply(vRightBottom, vNear);
+    vCorners[3] = XMVectorMultiply(vLeftBottom, vNear);
+    vCorners[4] = XMVectorMultiply(vLeftTop, vFar);
+    vCorners[5] = XMVectorMultiply(vRightTop, vFar);
+    vCorners[6] = XMVectorMultiply(vRightBottom, vFar);
+    vCorners[7] = XMVectorMultiply(vLeftBottom, vFar);
+
+    for (size_t i = 0; i < CORNER_COUNT; ++i)
+    {
+        XMVECTOR C = XMVectorAdd(XMVector3Rotate(vCorners[i], vOrientation), vOrigin);
+        XMStoreFloat3(&Corners[i], C);
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Point in frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::Contains(FXMVECTOR Point) const noexcept
+{
+    // Build frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
+    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
+    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
+    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
+    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
+    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
+
+    // Load origin and orientation.
+    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+    // Transform the point into the local space of the frustum.
+    XMVECTOR TPoint = XMVector3InverseRotate(XMVectorSubtract(Point, vOrigin), vOrientation);
+
+    // Set w to one.
+    TPoint = XMVectorInsert<0, 0, 0, 0, 1>(TPoint, XMVectorSplatOne());
+
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Outside = Zero;
+
+    // Test the point against each plane of the frustum.
+    for (size_t i = 0; i < 6; ++i)
+    {
+        XMVECTOR Dot = XMVector4Dot(TPoint, Planes[i]);
+        Outside = XMVectorOrInt(Outside, XMVectorGreater(Dot, Zero));
+    }
+
+    return XMVector4NotEqualInt(Outside, XMVectorTrueInt()) ? CONTAINS : DISJOINT;
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept
+{
+    // Load origin and orientation of the frustum.
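+    // (Illustrative note, not part of the library: each plane below is packed
+    // as (n.x, n.y, n.z, d) so that a 4-component dot with (p, 1) yields the
+    // signed distance n dot p + d; e.g. in local space the near plane
+    // (0, 0, -1, Near) is positive exactly when p.z < Near. XMPlaneTransform
+    // then moves each plane into world space before it is normalized.)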
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return TriangleTests::ContainedBy(V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingSphere& sh) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return sh.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingBox& box) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingOrientedBox& box) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingFrustum& fr) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return fr.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +// Exact sphere vs frustum test. The algorithm first checks the sphere against +// the planes of the frustum, then if the plane checks were indeterminate finds +// the nearest feature (plane, line, point) on the frustum to the center of the +// sphere and compares the distance to the nearest feature to the radius of the +// sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Normalize the planes so we can compare to the sphere radius. + Planes[2] = XMVector3Normalize(Planes[2]); + Planes[3] = XMVector3Normalize(Planes[3]); + Planes[4] = XMVector3Normalize(Planes[4]); + Planes[5] = XMVector3Normalize(Planes[5]); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&sh.Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&sh.Radius); + + // Transform the center of the sphere into the local space of frustum. + vCenter = XMVector3InverseRotate(XMVectorSubtract(vCenter, vOrigin), vOrientation); + + // Set w of the center to one so we can dot4 with the plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Check against each plane of the frustum. 
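+    // (Illustrative note, not part of the library: three running masks are
+    // kept per plane distance Dist[i]: Dist > r means the sphere is fully
+    // outside that plane, Dist <= -r means fully inside it, and Dist <= 0
+    // means the center is inside it. Only when none of the early-out
+    // combinations holds does the code fall through to the exact
+    // nearest-feature tests that follow.)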
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for (size_t i = 0; i < 6; ++i) + { + Dist[i] = XMVector4Dot(vCenter, Planes[i]); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist[i], vRadius)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Dist[i], XMVectorNegate(vRadius))); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist[i], Zero)); + } + + // If the sphere is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the sphere is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for (size_t i = 0; i < 6; ++i) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = XMVectorNegativeMultiplySubtract(Planes[i], Dist[i], vCenter); + + // Set w of the point to one. + Point = XMVectorInsert<0, 0, 0, 0, 1>(Point, XMVectorSplatOne()); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. + XMVECTOR InsideFace = XMVectorTrueInt(); + + for (size_t j = 0; j < 4; j++) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt(InsideFace, + XMVectorLessOrEqual(XMVector4Dot(Point, Planes[plane_index]), Zero)); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt(Intersects, + XMVectorAndInt(XMVectorGreater(Dist[i], Zero), InsideFace)); + } + + if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. 
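+    // (Illustrative note, not part of the library: when no face is the nearest
+    // feature, the closest point on the frustum to the sphere center must lie
+    // on an edge or corner, so the remaining work is twelve point-vs-segment
+    // distance checks against the edges built below; the sphere intersects if
+    // any squared distance is at most Radius^2.)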
+ XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // The Edges are: + static const size_t edges[12][2] = + { + { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 }, // Near plane + { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 }, // Far plane + { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 }, + }; // Near to far + + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + // Check to see if the nearest feature is one of the edges (or corners). + for (size_t i = 0; i < 12; ++i) + { + size_t ei0 = edges[i][0]; + size_t ei1 = edges[i][1]; + + // Find the nearest point on the edge to the center of the sphere. + // The corners of the frustum are included as the endpoints of the edges. + XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint(Corners[ei0], Corners[ei1], vCenter); + + XMVECTOR Delta = XMVectorSubtract(vCenter, Point); + + XMVECTOR DistSq = XMVector3Dot(Delta, Delta); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersects = XMVectorOrInt(Intersects, XMVectorLessOrEqual(DistSq, RadiusSq)); + } + + if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) + return true; + + // The sphere must be outside the frustum. + return false; +} + + +//----------------------------------------------------------------------------- +// Exact axis aligned box vs frustum test. Constructs an oriented box and uses +// the oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs frustum test. + BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Intersects(obox); +} + + +//----------------------------------------------------------------------------- +// Exact oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingOrientedBox& box) const noexcept +{ + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. 
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
+    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
+    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
+    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
+    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
+    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
+    XMVECTOR FrustumOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(FrustumOrientation));
+
+    // Load the box.
+    XMVECTOR Center = XMLoadFloat3(&box.Center);
+    XMVECTOR Extents = XMLoadFloat3(&box.Extents);
+    XMVECTOR BoxOrientation = XMLoadFloat4(&box.Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation));
+
+    // Transform the oriented box into the space of the frustum in order to
+    // minimize the number of transforms we have to do.
+    Center = XMVector3InverseRotate(XMVectorSubtract(Center, vOrigin), FrustumOrientation);
+    BoxOrientation = XMQuaternionMultiply(BoxOrientation, XMQuaternionConjugate(FrustumOrientation));
+
+    // Set w of the center to one so we can dot4 with the plane.
+    Center = XMVectorInsert<0, 0, 0, 0, 1>(Center, XMVectorSplatOne());
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation);
+
+    // Check against each plane of the frustum.
+    XMVECTOR Outside = XMVectorFalseInt();
+    XMVECTOR InsideAll = XMVectorTrueInt();
+    XMVECTOR CenterInsideAll = XMVectorTrueInt();
+
+    for (size_t i = 0; i < 6; ++i)
+    {
+        // Compute the distance to the center of the box.
+        XMVECTOR Dist = XMVector4Dot(Center, Planes[i]);
+
+        // Project the axes of the box onto the normal of the plane. Half the
+        // length of the projection (sometimes called the "radius") is equal to
+        // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+        // where h(i) are the extents of the box, n is the plane normal, and
+        // b(i) are the axes of the box.
+        XMVECTOR Radius = XMVector3Dot(Planes[i], R.r[0]);
+        Radius = XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[1]), SelectY);
+        Radius = XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[2]), SelectZ);
+        Radius = XMVector3Dot(Extents, XMVectorAbs(Radius));
+
+        // Outside the plane?
+        Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, Radius));
+
+        // Fully inside the plane?
+        InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Dist, XMVectorNegate(Radius)));
+
+        // Check if the center is inside the plane.
+        CenterInsideAll = XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist, Zero));
+    }
+
+    // If the box is outside any of the planes it is outside.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return false;
+
+    // If the box is inside all planes it is fully inside.
+    if (XMVector4EqualInt(InsideAll, XMVectorTrueInt()))
+        return true;
+
+    // If the center of the box is inside all planes and the box intersects
+    // one or more planes then it must intersect.
+    if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt()))
+        return true;
+
+    // Build the corners of the frustum.
+ XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // Test against box axes (3) + { + // Find the min/max values of the projection of the frustum onto each axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = XMVector3Dot(Corners[0], R.r[0]); + FrustumMin = XMVectorSelect(FrustumMin, XMVector3Dot(Corners[0], R.r[1]), SelectY); + FrustumMin = XMVectorSelect(FrustumMin, XMVector3Dot(Corners[0], R.r[2]), SelectZ); + FrustumMax = FrustumMin; + + for (size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR Temp = XMVector3Dot(Corners[i], R.r[0]); + Temp = XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[1]), SelectY); + Temp = XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[2]), SelectZ); + + FrustumMin = XMVectorMin(FrustumMin, Temp); + FrustumMax = XMVectorMax(FrustumMax, Temp); + } + + // Project the center of the box onto the axes. + XMVECTOR BoxDist = XMVector3Dot(Center, R.r[0]); + BoxDist = XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[1]), SelectY); + BoxDist = XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[2]), SelectZ); + + // The projection of the box onto the axis is just its Center and Extents. + // if (min > box_max || max < box_min) reject; + XMVECTOR Result = XMVectorOrInt(XMVectorGreater(FrustumMin, XMVectorAdd(BoxDist, Extents)), + XMVectorLess(FrustumMax, XMVectorSubtract(BoxDist, Extents))); + + if (DirectX::Internal::XMVector3AnyTrue(Result)) + return false; + } + + // Test against edge/edge axes (3*6). + XMVECTOR FrustumEdgeAxis[6]; + + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop); + FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop); + + for (size_t i = 0; i < 3; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(R.r[i], FrustumEdgeAxis[j]); + + // Find the min/max values of the projection of the frustum onto the axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = FrustumMax = XMVector3Dot(Axis, Corners[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]); + FrustumMin = XMVectorMin(FrustumMin, Temp); + FrustumMax = XMVectorMax(FrustumMax, Temp); + } + + // Project the center of the box onto the axis. + XMVECTOR Dist = XMVector3Dot(Center, Axis); + + // Project the axes of the box onto the axis to find the "radius" of the box. 
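+    // (Illustrative note, not part of the library: the "radius" below is
+    // r = h.x*|Axis dot b(u)| + h.y*|Axis dot b(v)| + h.z*|Axis dot b(w)|.
+    // For example, extents (1, 2, 3) with axis projections (0.5, 0, 1) give
+    // r = 1*0.5 + 2*0 + 3*1 = 3.5, and the interval [Dist - r, Dist + r] is
+    // then compared against the frustum's projected [min, max].)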
+ XMVECTOR Radius = XMVector3Dot(Axis, R.r[0]); + Radius = XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[1]), SelectY); + Radius = XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[2]), SelectZ); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // if (center > max + radius || center < min - radius) reject; + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, XMVectorAdd(FrustumMax, Radius))); + Outside = XMVectorOrInt(Outside, XMVectorLess(Dist, XMVectorSubtract(FrustumMin, Radius))); + } + } + + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the box must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +// Exact frustum vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingFrustum& fr) const noexcept +{ + // Load origin and orientation of frustum B. + XMVECTOR OriginB = XMLoadFloat3(&Origin); + XMVECTOR OrientationB = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(OrientationB)); + + // Build the planes of frustum B. + XMVECTOR AxisB[6]; + AxisB[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f); + AxisB[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f); + AxisB[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + AxisB[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + AxisB[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + AxisB[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + XMVECTOR PlaneDistB[6]; + PlaneDistB[0] = XMVectorNegate(XMVectorReplicatePtr(&Near)); + PlaneDistB[1] = XMVectorReplicatePtr(&Far); + PlaneDistB[2] = XMVectorZero(); + PlaneDistB[3] = XMVectorZero(); + PlaneDistB[4] = XMVectorZero(); + PlaneDistB[5] = XMVectorZero(); + + // Load origin and orientation of frustum A. + XMVECTOR OriginA = XMLoadFloat3(&fr.Origin); + XMVECTOR OrientationA = XMLoadFloat4(&fr.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(OrientationA)); + + // Transform frustum A into the space of the frustum B in order to + // minimize the number of transforms we have to do. + OriginA = XMVector3InverseRotate(XMVectorSubtract(OriginA, OriginB), OrientationB); + OrientationA = XMQuaternionMultiply(OrientationA, XMQuaternionConjugate(OrientationB)); + + // Build the corners of frustum A (in the local space of B). 
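+    // (Illustrative note, not part of the library: expressing everything in
+    // B's local frame keeps B's planes in the cheap slope form used above, so
+    // only frustum A's eight corners and six plane normals need a full
+    // rotation. A corner is then OriginA plus a rotated slope vector scaled
+    // by z, which is what the XMVectorMultiplyAdd calls below compute.)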
+ XMVECTOR RightTopA = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottomA = XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTopA = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottomA = XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR NearA = XMVectorReplicatePtr(&fr.Near); + XMVECTOR FarA = XMVectorReplicatePtr(&fr.Far); + + RightTopA = XMVector3Rotate(RightTopA, OrientationA); + RightBottomA = XMVector3Rotate(RightBottomA, OrientationA); + LeftTopA = XMVector3Rotate(LeftTopA, OrientationA); + LeftBottomA = XMVector3Rotate(LeftBottomA, OrientationA); + + XMVECTOR CornersA[CORNER_COUNT]; + CornersA[0] = XMVectorMultiplyAdd(RightTopA, NearA, OriginA); + CornersA[1] = XMVectorMultiplyAdd(RightBottomA, NearA, OriginA); + CornersA[2] = XMVectorMultiplyAdd(LeftTopA, NearA, OriginA); + CornersA[3] = XMVectorMultiplyAdd(LeftBottomA, NearA, OriginA); + CornersA[4] = XMVectorMultiplyAdd(RightTopA, FarA, OriginA); + CornersA[5] = XMVectorMultiplyAdd(RightBottomA, FarA, OriginA); + CornersA[6] = XMVectorMultiplyAdd(LeftTopA, FarA, OriginA); + CornersA[7] = XMVectorMultiplyAdd(LeftBottomA, FarA, OriginA); + + // Check frustum A against each plane of frustum B. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max; + + Min = Max = XMVector3Dot(AxisB[i], CornersA[0]); + + for (size_t j = 1; j < CORNER_COUNT; j++) + { + XMVECTOR Temp = XMVector3Dot(AxisB[i], CornersA[j]); + Min = XMVectorMin(Min, Temp); + Max = XMVectorMax(Max, Temp); + } + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistB[i])); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Max, PlaneDistB[i])); + } + + // If the frustum A is outside any of the planes of frustum B it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If frustum A is inside all planes of frustum B it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of frustum B. + XMVECTOR RightTopB = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottomB = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTopB = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottomB = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR NearB = XMVectorReplicatePtr(&Near); + XMVECTOR FarB = XMVectorReplicatePtr(&Far); + + XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT]; + CornersB[0] = XMVectorMultiply(RightTopB, NearB); + CornersB[1] = XMVectorMultiply(RightBottomB, NearB); + CornersB[2] = XMVectorMultiply(LeftTopB, NearB); + CornersB[3] = XMVectorMultiply(LeftBottomB, NearB); + CornersB[4] = XMVectorMultiply(RightTopB, FarB); + CornersB[5] = XMVectorMultiply(RightBottomB, FarB); + CornersB[6] = XMVectorMultiply(LeftTopB, FarB); + CornersB[7] = XMVectorMultiply(LeftBottomB, FarB); + + // Build the planes of frustum A (in the local space of B). 
+    XMVECTOR AxisA[6];
+    XMVECTOR PlaneDistA[6];
+
+    AxisA[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f);
+    AxisA[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f);
+    AxisA[2] = XMVectorSet(1.0f, 0.0f, -fr.RightSlope, 0.0f);
+    AxisA[3] = XMVectorSet(-1.0f, 0.0f, fr.LeftSlope, 0.0f);
+    AxisA[4] = XMVectorSet(0.0f, 1.0f, -fr.TopSlope, 0.0f);
+    AxisA[5] = XMVectorSet(0.0f, -1.0f, fr.BottomSlope, 0.0f);
+
+    AxisA[0] = XMVector3Rotate(AxisA[0], OrientationA);
+    AxisA[1] = XMVectorNegate(AxisA[0]);
+    AxisA[2] = XMVector3Rotate(AxisA[2], OrientationA);
+    AxisA[3] = XMVector3Rotate(AxisA[3], OrientationA);
+    AxisA[4] = XMVector3Rotate(AxisA[4], OrientationA);
+    AxisA[5] = XMVector3Rotate(AxisA[5], OrientationA);
+
+    PlaneDistA[0] = XMVector3Dot(AxisA[0], CornersA[0]);    // Re-use corner on near plane.
+    PlaneDistA[1] = XMVector3Dot(AxisA[1], CornersA[4]);    // Re-use corner on far plane.
+    PlaneDistA[2] = XMVector3Dot(AxisA[2], OriginA);
+    PlaneDistA[3] = XMVector3Dot(AxisA[3], OriginA);
+    PlaneDistA[4] = XMVector3Dot(AxisA[4], OriginA);
+    PlaneDistA[5] = XMVector3Dot(AxisA[5], OriginA);
+
+    // Check each axis of frustum A for a separating plane (5).
+    for (size_t i = 0; i < 6; ++i)
+    {
+        // Find the minimum projection of the frustum onto the plane normal.
+        XMVECTOR Min;
+
+        Min = XMVector3Dot(AxisA[i], CornersB[0]);
+
+        for (size_t j = 1; j < CORNER_COUNT; j++)
+        {
+            XMVECTOR Temp = XMVector3Dot(AxisA[i], CornersB[j]);
+            Min = XMVectorMin(Min, Temp);
+        }
+
+        // Outside the plane?
+        Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistA[i]));
+    }
+
+    // If the frustum B is outside any of the planes of frustum A it is outside.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return false;
+
+    // Check edge/edge axes (6 * 6).
+    XMVECTOR FrustumEdgeAxisA[6];
+    FrustumEdgeAxisA[0] = RightTopA;
+    FrustumEdgeAxisA[1] = RightBottomA;
+    FrustumEdgeAxisA[2] = LeftTopA;
+    FrustumEdgeAxisA[3] = LeftBottomA;
+    FrustumEdgeAxisA[4] = XMVectorSubtract(RightTopA, LeftTopA);
+    FrustumEdgeAxisA[5] = XMVectorSubtract(LeftBottomA, LeftTopA);
+
+    XMVECTOR FrustumEdgeAxisB[6];
+    FrustumEdgeAxisB[0] = RightTopB;
+    FrustumEdgeAxisB[1] = RightBottomB;
+    FrustumEdgeAxisB[2] = LeftTopB;
+    FrustumEdgeAxisB[3] = LeftBottomB;
+    FrustumEdgeAxisB[4] = XMVectorSubtract(RightTopB, LeftTopB);
+    FrustumEdgeAxisB[5] = XMVectorSubtract(LeftBottomB, LeftTopB);
+
+    for (size_t i = 0; i < 6; ++i)
+    {
+        for (size_t j = 0; j < 6; j++)
+        {
+            // Compute the axis we are going to test.
+            XMVECTOR Axis = XMVector3Cross(FrustumEdgeAxisA[i], FrustumEdgeAxisB[j]);
+
+            // Find the min/max values of the projection of both frustums onto the axis.
+            XMVECTOR MinA, MaxA;
+            XMVECTOR MinB, MaxB;
+
+            MinA = MaxA = XMVector3Dot(Axis, CornersA[0]);
+            MinB = MaxB = XMVector3Dot(Axis, CornersB[0]);
+
+            for (size_t k = 1; k < CORNER_COUNT; k++)
+            {
+                XMVECTOR TempA = XMVector3Dot(Axis, CornersA[k]);
+                MinA = XMVectorMin(MinA, TempA);
+                MaxA = XMVectorMax(MaxA, TempA);
+
+                XMVECTOR TempB = XMVector3Dot(Axis, CornersB[k]);
+                MinB = XMVectorMin(MinB, TempB);
+                MaxB = XMVectorMax(MaxB, TempB);
+            }
+
+            // if (MinA > MaxB || MinB > MaxA) reject
+            Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB));
+            Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA));
+        }
+    }
+
+    // If there is a separating plane, then the frustums do not intersect.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return false;
+
+    // If we did not find a separating plane then the frustums intersect.
+ return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, -Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Transform triangle into the local space of frustum. + XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vOrigin), vOrientation); + XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vOrigin), vOrientation); + XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vOrigin), vOrientation); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Dist0 = XMVector3Dot(TV0, Planes[i]); + XMVECTOR Dist1 = XMVector3Dot(TV1, Planes[i]); + XMVECTOR Dist2 = XMVector3Dot(TV2, Planes[i]); + + XMVECTOR MinDist = XMVectorMin(Dist0, Dist1); + MinDist = XMVectorMin(MinDist, Dist2); + XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1); + MaxDist = XMVectorMax(MaxDist, Dist2); + + XMVECTOR PlaneDist = XMVectorSplatW(Planes[i]); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinDist, PlaneDist)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(MaxDist, PlaneDist)); + } + + // If the triangle is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the triangle is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // Test the plane of the triangle. 
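+    // The triangle lies in the plane dot(Normal, x) = Dist. Projecting all
+    // eight frustum corners onto Normal and comparing the resulting
+    // [MinDist, MaxDist] interval against Dist below tells us whether the
+    // frustum straddles the triangle's plane.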
+ XMVECTOR Normal = XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)); + XMVECTOR Dist = XMVector3Dot(Normal, V0); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot(Corners[0], Normal); + for (size_t i = 1; i < CORNER_COUNT; ++i) + { + XMVECTOR Temp = XMVector3Dot(Corners[i], Normal); + MinDist = XMVectorMin(MinDist, Temp); + MaxDist = XMVectorMax(MaxDist, Temp); + } + + Outside = XMVectorOrInt(XMVectorGreater(MinDist, Dist), XMVectorLess(MaxDist, Dist)); + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // Check the edge/edge axes (3*6). + XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = XMVectorSubtract(V1, V0); + TriangleEdgeAxis[1] = XMVectorSubtract(V2, V1); + TriangleEdgeAxis[2] = XMVectorSubtract(V0, V2); + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop); + FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop); + + for (size_t i = 0; i < 3; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(TriangleEdgeAxis[i], FrustumEdgeAxis[j]); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot(V0, Axis); + XMVECTOR Dist1 = XMVector3Dot(V1, Axis); + XMVECTOR Dist2 = XMVector3Dot(V2, Axis); + + MinA = XMVectorMin(Dist0, Dist1); + MinA = XMVectorMin(MinA, Dist2); + MaxA = XMVectorMax(Dist0, Dist1); + MaxA = XMVectorMax(MaxA, Dist2); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot(Axis, Corners[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]); + MinB = XMVectorMin(MinB, Temp); + MaxB = XMVectorMax(MaxB, Temp); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB)); + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA)); + } + } + + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne()); + + // Build the corners of the frustum (in world space). 
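+    // Each corner is Origin + Rotate(SlopeDir) * depth, where SlopeDir is
+    // (+/-Slope_x, +/-Slope_y, 1, 0) and depth is Near or Far. Because vOrigin
+    // carries w = 1 and the slope vectors carry w = 0, the corners come out
+    // with w = 1, ready to be dot4'd against the plane.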
+    XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
+    XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
+    XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
+    XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
+    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
+    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
+
+    RightTop = XMVector3Rotate(RightTop, vOrientation);
+    RightBottom = XMVector3Rotate(RightBottom, vOrientation);
+    LeftTop = XMVector3Rotate(LeftTop, vOrientation);
+    LeftBottom = XMVector3Rotate(LeftBottom, vOrientation);
+
+    XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin);
+    XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin);
+    XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin);
+    XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin);
+    XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin);
+    XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin);
+    XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin);
+    XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin);
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3,
+        Corners4, Corners5, Corners6, Corners7,
+        Plane, Outside, Inside);
+
+    // If the frustum is entirely in front of the plane, it does not intersect it.
+    if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+        return FRONT;
+
+    // If the frustum is entirely behind the plane, it does not intersect it.
+    if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+        return BACK;
+
+    // The frustum straddles the plane, so it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Ray vs. frustum test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist) const noexcept
+{
+    // If the ray starts inside the frustum, return a distance of 0 for the hit.
+    if (Contains(rayOrigin) == CONTAINS)
+    {
+        Dist = 0.0f;
+        return true;
+    }
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
+    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
+    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
+    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
+    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
+    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR frOrigin = XMLoadFloat3(&Origin);
+    XMVECTOR frOrientation = XMLoadFloat4(&Orientation);
+
+    // This algorithm is based on "Fast Ray-Convex Polyhedron Intersection," in James Arvo, ed., Graphics Gems II, pp. 247-250.
+    float tnear = -FLT_MAX;
+    float tfar = FLT_MAX;
+
+    for (size_t i = 0; i < 6; ++i)
+    {
+        XMVECTOR Plane = DirectX::Internal::XMPlaneTransform(Planes[i], frOrientation, frOrigin);
+        Plane = XMPlaneNormalize(Plane);
+
+        XMVECTOR AxisDotOrigin = XMPlaneDotCoord(Plane, rayOrigin);
+        XMVECTOR AxisDotDirection = XMVector3Dot(Plane, Direction);
+
+        if (XMVector3LessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon))
+        {
+            // Ray is parallel to plane - check if ray origin is inside plane's half-space.
+            if (XMVector3Greater(AxisDotOrigin, g_XMZero))
+            {
+                // Ray origin is outside half-space.
+                Dist = 0.f;
+                return false;
+            }
+        }
+        else
+        {
+            // Ray not parallel - get distance to plane.
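+            // Solving dot(Plane, O + t*D) = 0 along the ray O + t*D gives
+            //     t = -dot(Plane, O) / dot(Plane, D) = -vn / vd.
+            // The sign of vd tells us whether the ray is entering (front
+            // face) or leaving (back face) the half-space, which selects
+            // whether t updates tnear or tfar.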
+            float vd = XMVectorGetX(AxisDotDirection);
+            float vn = XMVectorGetX(AxisDotOrigin);
+            float t = -vn / vd;
+            if (vd < 0.0f)
+            {
+                // Front face - T is a near point.
+                if (t > tfar)
+                {
+                    Dist = 0.f;
+                    return false;
+                }
+                if (t > tnear)
+                {
+                    // Hit near face.
+                    tnear = t;
+                }
+            }
+            else
+            {
+                // Back face - T is a far point.
+                if (t < tnear)
+                {
+                    Dist = 0.f;
+                    return false;
+                }
+                if (t < tfar)
+                {
+                    // Hit far face.
+                    tfar = t;
+                }
+            }
+        }
+    }
+
+    // Survived all tests.
+    // Note: if ray originates on polyhedron, may want to change 0.0f to some
+    // epsilon to avoid intersecting the originating face.
+    float distance = (tnear >= 0.0f) ? tnear : tfar;
+    if (distance >= 0.0f)
+    {
+        Dist = distance;
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a frustum vs 6 planes (typically forming another frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy(
+    FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+    GXMVECTOR Plane3,
+    HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept
+{
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+    // Set w of the origin to one so we can dot4 with a plane.
+    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne());
+
+    // Build the corners of the frustum (in world space).
+    XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
+    XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
+    XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
+    XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
+    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
+    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
+
+    RightTop = XMVector3Rotate(RightTop, vOrientation);
+    RightBottom = XMVector3Rotate(RightBottom, vOrientation);
+    LeftTop = XMVector3Rotate(LeftTop, vOrientation);
+    LeftBottom = XMVector3Rotate(LeftBottom, vOrientation);
+
+    XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin);
+    XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin);
+    XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin);
+    XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin);
+    XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin);
+    XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin);
+    XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin);
+    XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin);
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
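+    // Each FastIntersectFrustumPlane call reports, for one plane, whether all
+    // corners are outside it (Outside) or all are inside it (Inside); the
+    // per-plane results fold together with OR for "outside any plane" and
+    // AND for "inside all planes".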
+ DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane1, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane2, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane3, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane4, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane5, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the frustum is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the frustum is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The frustum is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Build the 6 frustum planes from a frustum. +// +// The intended use for these routines is for fast culling to a view frustum. +// When the volume being tested against a view frustum is small relative to the +// view frustum it is usually either inside all six planes of the frustum +// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither +// of these cases is true then it may or may not be intersecting the frustum +// (INTERSECTS) +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetPlanes(XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, + XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane) const noexcept +{ + // Load origin and orientation of the frustum. 
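+    // Each plane below is first written in the frustum's local space as
+    // (nx, ny, nz, d) with dot(n, x) + d = 0, then moved to world space with
+    // XMPlaneTransform and renormalized.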
+    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
+    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+    if (NearPlane)
+    {
+        XMVECTOR vNearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
+        vNearPlane = DirectX::Internal::XMPlaneTransform(vNearPlane, vOrientation, vOrigin);
+        *NearPlane = XMPlaneNormalize(vNearPlane);
+    }
+
+    if (FarPlane)
+    {
+        XMVECTOR vFarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
+        vFarPlane = DirectX::Internal::XMPlaneTransform(vFarPlane, vOrientation, vOrigin);
+        *FarPlane = XMPlaneNormalize(vFarPlane);
+    }
+
+    if (RightPlane)
+    {
+        XMVECTOR vRightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
+        vRightPlane = DirectX::Internal::XMPlaneTransform(vRightPlane, vOrientation, vOrigin);
+        *RightPlane = XMPlaneNormalize(vRightPlane);
+    }
+
+    if (LeftPlane)
+    {
+        XMVECTOR vLeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
+        vLeftPlane = DirectX::Internal::XMPlaneTransform(vLeftPlane, vOrientation, vOrigin);
+        *LeftPlane = XMPlaneNormalize(vLeftPlane);
+    }
+
+    if (TopPlane)
+    {
+        XMVECTOR vTopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
+        vTopPlane = DirectX::Internal::XMPlaneTransform(vTopPlane, vOrientation, vOrigin);
+        *TopPlane = XMPlaneNormalize(vTopPlane);
+    }
+
+    if (BottomPlane)
+    {
+        XMVECTOR vBottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
+        vBottomPlane = DirectX::Internal::XMPlaneTransform(vBottomPlane, vOrientation, vOrigin);
+        *BottomPlane = XMPlaneNormalize(vBottomPlane);
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Build a frustum from a perspective projection matrix. The matrix may only
+// contain a projection; any rotation, translation or scale will cause the
+// constructed frustum to be incorrect.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix(BoundingFrustum& Out, FXMMATRIX Projection, bool rhcoords) noexcept
+{
+    // Corners of the projection frustum in homogeneous space.
+    static XMVECTORF32 HomogenousPoints[6] =
+    {
+        { { { 1.0f, 0.0f, 1.0f, 1.0f } } },   // right (at far plane)
+        { { { -1.0f, 0.0f, 1.0f, 1.0f } } },  // left
+        { { { 0.0f, 1.0f, 1.0f, 1.0f } } },   // top
+        { { { 0.0f, -1.0f, 1.0f, 1.0f } } },  // bottom
+
+        { { { 0.0f, 0.0f, 0.0f, 1.0f } } },   // near
+        { { { 0.0f, 0.0f, 1.0f, 1.0f } } }    // far
+    };
+
+    XMVECTOR Determinant;
+    XMMATRIX matInverse = XMMatrixInverse(&Determinant, Projection);
+
+    // Compute the frustum corners in view space.
+    XMVECTOR Points[6];
+
+    for (size_t i = 0; i < 6; ++i)
+    {
+        // Transform point.
+        Points[i] = XMVector4Transform(HomogenousPoints[i], matInverse);
+    }
+
+    Out.Origin = XMFLOAT3(0.0f, 0.0f, 0.0f);
+    Out.Orientation = XMFLOAT4(0.0f, 0.0f, 0.0f, 1.0f);
+
+    // Compute the slopes.
+    Points[0] = XMVectorMultiply(Points[0], XMVectorReciprocal(XMVectorSplatZ(Points[0])));
+    Points[1] = XMVectorMultiply(Points[1], XMVectorReciprocal(XMVectorSplatZ(Points[1])));
+    Points[2] = XMVectorMultiply(Points[2], XMVectorReciprocal(XMVectorSplatZ(Points[2])));
+    Points[3] = XMVectorMultiply(Points[3], XMVectorReciprocal(XMVectorSplatZ(Points[3])));
+
+    Out.RightSlope = XMVectorGetX(Points[0]);
+    Out.LeftSlope = XMVectorGetX(Points[1]);
+    Out.TopSlope = XMVectorGetY(Points[2]);
+    Out.BottomSlope = XMVectorGetY(Points[3]);
+
+    // Compute near and far.
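+    // Points[4] and Points[5] are the unprojected centers of the near and far
+    // planes; dividing by w completes the perspective unprojection so their z
+    // components give the plane distances. The rhcoords branch swaps which
+    // point supplies Near versus Far.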
+    Points[4] = XMVectorMultiply(Points[4], XMVectorReciprocal(XMVectorSplatW(Points[4])));
+    Points[5] = XMVectorMultiply(Points[5], XMVectorReciprocal(XMVectorSplatW(Points[5])));
+
+    if (rhcoords)
+    {
+        Out.Near = XMVectorGetZ(Points[5]);
+        Out.Far = XMVectorGetZ(Points[4]);
+    }
+    else
+    {
+        Out.Near = XMVectorGetZ(Points[4]);
+        Out.Far = XMVectorGetZ(Points[5]);
+    }
+}
+
+
+/****************************************************************************
+ *
+ * TriangleTests
+ *
+ ****************************************************************************/
+
+namespace TriangleTests
+{
+
+    //-----------------------------------------------------------------------------
+    // Compute the intersection of a ray (Origin, Direction) with a triangle
+    // (V0, V1, V2). Return true if there is an intersection and also set Dist
+    // to the distance along the ray to the intersection.
+    //
+    // The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage
+    // Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1,
+    // pp 21-28, 1997.
+    //-----------------------------------------------------------------------------
+    _Use_decl_annotations_
+    inline bool XM_CALLCONV Intersects(
+        FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0,
+        GXMVECTOR V1,
+        HXMVECTOR V2, float& Dist) noexcept
+    {
+        assert(DirectX::Internal::XMVector3IsUnit(Direction));
+
+        XMVECTOR Zero = XMVectorZero();
+
+        XMVECTOR e1 = XMVectorSubtract(V1, V0);
+        XMVECTOR e2 = XMVectorSubtract(V2, V0);
+
+        // p = Direction ^ e2;
+        XMVECTOR p = XMVector3Cross(Direction, e2);
+
+        // det = e1 * p;
+        XMVECTOR det = XMVector3Dot(e1, p);
+
+        XMVECTOR u, v, t;
+
+        if (XMVector3GreaterOrEqual(det, g_RayEpsilon))
+        {
+            // Determinant is positive (front side of the triangle).
+            XMVECTOR s = XMVectorSubtract(Origin, V0);
+
+            // u = s * p;
+            u = XMVector3Dot(s, p);
+
+            XMVECTOR NoIntersection = XMVectorLess(u, Zero);
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(u, det));
+
+            // q = s ^ e1;
+            XMVECTOR q = XMVector3Cross(s, e1);
+
+            // v = Direction * q;
+            v = XMVector3Dot(Direction, q);
+
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(v, Zero));
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(XMVectorAdd(u, v), det));
+
+            // t = e2 * q;
+            t = XMVector3Dot(e2, q);
+
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(t, Zero));
+
+            if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt()))
+            {
+                Dist = 0.f;
+                return false;
+            }
+        }
+        else if (XMVector3LessOrEqual(det, g_RayNegEpsilon))
+        {
+            // Determinant is negative (back side of the triangle).
+            XMVECTOR s = XMVectorSubtract(Origin, V0);
+
+            // u = s * p;
+            u = XMVector3Dot(s, p);
+
+            XMVECTOR NoIntersection = XMVectorGreater(u, Zero);
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(u, det));
+
+            // q = s ^ e1;
+            XMVECTOR q = XMVector3Cross(s, e1);
+
+            // v = Direction * q;
+            v = XMVector3Dot(Direction, q);
+
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(v, Zero));
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorAdd(u, v), det));
+
+            // t = e2 * q;
+            t = XMVector3Dot(e2, q);
+
+            NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(t, Zero));
+
+            if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt()))
+            {
+                Dist = 0.f;
+                return false;
+            }
+        }
+        else
+        {
+            // Parallel ray.
+            Dist = 0.f;
+            return false;
+        }
+
+        t = XMVectorDivide(t, det);
+
+        // (u / det) and (v / det) are the barycentric coordinates of the intersection.
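+        // Moller-Trumbore solves O + t*D = (1-u-v)*V0 + u*V1 + v*V2 with
+        // Cramer's rule; u, v, and t above are all still scaled by det, so
+        // only t - the one value we return - gets the final divide.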
+
+        // Store the x-component to Dist.
+        XMStoreFloat(&Dist, t);
+
+        return true;
+    }
+
+
+    //-----------------------------------------------------------------------------
+    // Test if two triangles intersect.
+    //
+    // The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
+    // Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
+    // Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
+    // Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
+    // of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
+    //
+    // The final test could be considered an edge-edge separating plane test with
+    // the 9 possible cases narrowed down to the only two pairs of edges that can
+    // actually result in a separation.
+    //-----------------------------------------------------------------------------
+    _Use_decl_annotations_
+    inline bool XM_CALLCONV Intersects(FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2) noexcept
+    {
+        static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } };
+        static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
+        static const XMVECTORU32 Select0111 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1 } } };
+        static const XMVECTORU32 Select1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } };
+        static const XMVECTORU32 Select1101 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } };
+
+        XMVECTOR Zero = XMVectorZero();
+
+        // Compute the normal of triangle A.
+        XMVECTOR N1 = XMVector3Cross(XMVectorSubtract(A1, A0), XMVectorSubtract(A2, A0));
+
+        // Assert that the triangle is not degenerate.
+        assert(!XMVector3Equal(N1, Zero));
+
+        // Test points of B against the plane of A.
+        XMVECTOR BDist = XMVector3Dot(N1, XMVectorSubtract(B0, A0));
+        BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B1, A0)), SelectY);
+        BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B2, A0)), SelectZ);
+
+        // Ensure robustness with co-planar triangles by zeroing small distances.
+        uint32_t BDistIsZeroCR;
+        XMVECTOR BDistIsZero = XMVectorGreaterR(&BDistIsZeroCR, g_RayEpsilon, XMVectorAbs(BDist));
+        BDist = XMVectorSelect(BDist, Zero, BDistIsZero);
+
+        uint32_t BDistIsLessCR;
+        XMVECTOR BDistIsLess = XMVectorGreaterR(&BDistIsLessCR, Zero, BDist);
+
+        uint32_t BDistIsGreaterCR;
+        XMVECTOR BDistIsGreater = XMVectorGreaterR(&BDistIsGreaterCR, BDist, Zero);
+
+        // If all the points are on the same side we don't intersect.
+        if (XMComparisonAllTrue(BDistIsLessCR) || XMComparisonAllTrue(BDistIsGreaterCR))
+            return false;
+
+        // Compute the normal of triangle B.
+        XMVECTOR N2 = XMVector3Cross(XMVectorSubtract(B1, B0), XMVectorSubtract(B2, B0));
+
+        // Assert that the triangle is not degenerate.
+        assert(!XMVector3Equal(N2, Zero));
+
+        // Test points of A against the plane of B.
+        XMVECTOR ADist = XMVector3Dot(N2, XMVectorSubtract(A0, B0));
+        ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A1, B0)), SelectY);
+        ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A2, B0)), SelectZ);
+
+        // Ensure robustness with co-planar triangles by zeroing small distances.
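+        // Distances within g_RayEpsilon of zero are snapped to zero so that
+        // nearly co-planar triangles reliably take the dedicated co-planar
+        // path below rather than falling through with inconsistent signs.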
+        uint32_t ADistIsZeroCR;
+        XMVECTOR ADistIsZero = XMVectorGreaterR(&ADistIsZeroCR, g_RayEpsilon, XMVectorAbs(ADist));
+        ADist = XMVectorSelect(ADist, Zero, ADistIsZero);
+
+        uint32_t ADistIsLessCR;
+        XMVECTOR ADistIsLess = XMVectorGreaterR(&ADistIsLessCR, Zero, ADist);
+
+        uint32_t ADistIsGreaterCR;
+        XMVECTOR ADistIsGreater = XMVectorGreaterR(&ADistIsGreaterCR, ADist, Zero);
+
+        // If all the points are on the same side we don't intersect.
+        if (XMComparisonAllTrue(ADistIsLessCR) || XMComparisonAllTrue(ADistIsGreaterCR))
+            return false;
+
+        // Special case for co-planar triangles.
+        if (XMComparisonAllTrue(ADistIsZeroCR) || XMComparisonAllTrue(BDistIsZeroCR))
+        {
+            XMVECTOR Axis, Dist, MinDist;
+
+            // Compute an axis perpendicular to the edge (points out).
+            Axis = XMVector3Cross(N1, XMVectorSubtract(A1, A0));
+            Dist = XMVector3Dot(Axis, A0);
+
+            // Test points of B against the axis.
+            MinDist = XMVector3Dot(B0, Axis);
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis));
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis));
+            if (XMVector4GreaterOrEqual(MinDist, Dist))
+                return false;
+
+            // Edge (A1, A2)
+            Axis = XMVector3Cross(N1, XMVectorSubtract(A2, A1));
+            Dist = XMVector3Dot(Axis, A1);
+
+            MinDist = XMVector3Dot(B0, Axis);
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis));
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis));
+            if (XMVector4GreaterOrEqual(MinDist, Dist))
+                return false;
+
+            // Edge (A2, A0)
+            Axis = XMVector3Cross(N1, XMVectorSubtract(A0, A2));
+            Dist = XMVector3Dot(Axis, A2);
+
+            MinDist = XMVector3Dot(B0, Axis);
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis));
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis));
+            if (XMVector4GreaterOrEqual(MinDist, Dist))
+                return false;
+
+            // Edge (B0, B1)
+            Axis = XMVector3Cross(N2, XMVectorSubtract(B1, B0));
+            Dist = XMVector3Dot(Axis, B0);
+
+            MinDist = XMVector3Dot(A0, Axis);
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis));
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis));
+            if (XMVector4GreaterOrEqual(MinDist, Dist))
+                return false;
+
+            // Edge (B1, B2)
+            Axis = XMVector3Cross(N2, XMVectorSubtract(B2, B1));
+            Dist = XMVector3Dot(Axis, B1);
+
+            MinDist = XMVector3Dot(A0, Axis);
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis));
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis));
+            if (XMVector4GreaterOrEqual(MinDist, Dist))
+                return false;
+
+            // Edge (B2, B0)
+            Axis = XMVector3Cross(N2, XMVectorSubtract(B0, B2));
+            Dist = XMVector3Dot(Axis, B2);
+
+            MinDist = XMVector3Dot(A0, Axis);
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis));
+            MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis));
+            if (XMVector4GreaterOrEqual(MinDist, Dist))
+                return false;
+
+            return true;
+        }
+
+        //
+        // Find the single vertex of A and B (ie the vertex on the opposite side
+        // of the plane from the other two) and reorder the edges so we can compute
+        // the signed edge/edge distances.
+        //
+        // if ( (V0 >= 0 && V1 < 0 && V2 < 0) ||
+        //      (V0 > 0 && V1 <= 0 && V2 <= 0) ||
+        //      (V0 <= 0 && V1 > 0 && V2 > 0) ||
+        //      (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
+        //
+        // If our singular vertex is not on the positive side of the plane we reverse
+        // the triangle winding so that the overlap comparisons will compare the
+        // correct edges with the correct signs.
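+        // For example, plane distances of (+, -, -) or (+, 0, -) make A0 the
+        // singular vertex on the positive side, while (-, +, +) makes A0
+        // singular on the negative side, so (A1, A2) are swapped below to
+        // flip the winding.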
+ // + XMVECTOR ADistIsLessEqual = XMVectorOrInt(ADistIsLess, ADistIsZero); + XMVECTOR ADistIsGreaterEqual = XMVectorOrInt(ADistIsGreater, ADistIsZero); + + XMVECTOR AA0, AA1, AA2; + bool bPositiveA; + + if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select0111))) + { + // A0 is singular, crossing from positive to negative. + AA0 = A0; AA1 = A1; AA2 = A2; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select0111))) + { + // A0 is singular, crossing from negative to positive. + AA0 = A0; AA1 = A2; AA2 = A1; + bPositiveA = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select1011))) + { + // A1 is singular, crossing from positive to negative. + AA0 = A1; AA1 = A2; AA2 = A0; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select1011))) + { + // A1 is singular, crossing from negative to positive. + AA0 = A1; AA1 = A0; AA2 = A2; + bPositiveA = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select1101))) + { + // A2 is singular, crossing from positive to negative. + AA0 = A2; AA1 = A0; AA2 = A1; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select1101))) + { + // A2 is singular, crossing from negative to positive. + AA0 = A2; AA1 = A1; AA2 = A0; + bPositiveA = false; + } + else + { + assert(false); + return false; + } + + XMVECTOR BDistIsLessEqual = XMVectorOrInt(BDistIsLess, BDistIsZero); + XMVECTOR BDistIsGreaterEqual = XMVectorOrInt(BDistIsGreater, BDistIsZero); + + XMVECTOR BB0, BB1, BB2; + bool bPositiveB; + + if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select0111))) + { + // B0 is singular, crossing from positive to negative. + BB0 = B0; BB1 = B1; BB2 = B2; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select0111))) + { + // B0 is singular, crossing from negative to positive. + BB0 = B0; BB1 = B2; BB2 = B1; + bPositiveB = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select1011))) + { + // B1 is singular, crossing from positive to negative. 
+            BB0 = B1; BB1 = B2; BB2 = B0;
+            bPositiveB = true;
+        }
+        else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select1011)) ||
+            DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select1011)))
+        {
+            // B1 is singular, crossing from negative to positive.
+            BB0 = B1; BB1 = B0; BB2 = B2;
+            bPositiveB = false;
+        }
+        else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select1101)) ||
+            DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select1101)))
+        {
+            // B2 is singular, crossing from positive to negative.
+            BB0 = B2; BB1 = B0; BB2 = B1;
+            bPositiveB = true;
+        }
+        else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select1101)) ||
+            DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select1101)))
+        {
+            // B2 is singular, crossing from negative to positive.
+            BB0 = B2; BB1 = B1; BB2 = B0;
+            bPositiveB = false;
+        }
+        else
+        {
+            assert(false);
+            return false;
+        }
+
+        XMVECTOR Delta0, Delta1;
+
+        // Reverse the direction of the test depending on whether the singular vertices are
+        // the same sign or different signs.
+        if (bPositiveA ^ bPositiveB)
+        {
+            Delta0 = XMVectorSubtract(BB0, AA0);
+            Delta1 = XMVectorSubtract(AA0, BB0);
+        }
+        else
+        {
+            Delta0 = XMVectorSubtract(AA0, BB0);
+            Delta1 = XMVectorSubtract(BB0, AA0);
+        }
+
+        // Check if the triangles overlap on the line of intersection between the
+        // planes of the two triangles by finding the signed line distances.
+        XMVECTOR Dist0 = XMVector3Dot(Delta0, XMVector3Cross(XMVectorSubtract(BB2, BB0), XMVectorSubtract(AA2, AA0)));
+        if (XMVector4Greater(Dist0, Zero))
+            return false;
+
+        XMVECTOR Dist1 = XMVector3Dot(Delta1, XMVector3Cross(XMVectorSubtract(BB1, BB0), XMVectorSubtract(AA1, AA0)));
+        if (XMVector4Greater(Dist1, Zero))
+            return false;
+
+        return true;
+    }
+
+
+    //-----------------------------------------------------------------------------
+    // Triangle vs. plane test
+    //-----------------------------------------------------------------------------
+    _Use_decl_annotations_
+    inline PlaneIntersectionType XM_CALLCONV Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane) noexcept
+    {
+        XMVECTOR One = XMVectorSplatOne();
+
+        assert(DirectX::Internal::XMPlaneIsUnit(Plane));
+
+        // Set w of the points to one so we can dot4 with a plane.
+        XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+        XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+        XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+        XMVECTOR Outside, Inside;
+        DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane, Outside, Inside);
+
+        // If the triangle is entirely in front of the plane, it does not intersect it.
+        if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+            return FRONT;
+
+        // If the triangle is entirely behind the plane, it does not intersect it.
+        if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+            return BACK;
+
+        // The triangle straddles the plane, so it intersects.
+        return INTERSECTING;
+    }
+
+
+    //-----------------------------------------------------------------------------
+    // Test a triangle vs 6 planes (typically forming a frustum).
+ //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline ContainmentType XM_CALLCONV ContainedBy( + FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, + GXMVECTOR Plane0, + HXMVECTOR Plane1, HXMVECTOR Plane2, + CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5) noexcept + { + XMVECTOR One = XMVectorSplatOne(); + + // Set w of the points to one so we can dot4 with a plane. + XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); + XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); + XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the triangle is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the triangle is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The triangle is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; + } + +} // namespace TriangleTests + diff --git a/DirectXMath/DirectXColors.h b/DirectXMath/DirectXColors.h new file mode 100644 index 0000000..21eac36 --- /dev/null +++ b/DirectXMath/DirectXColors.h @@ -0,0 +1,165 @@ +//------------------------------------------------------------------------------------- +// DirectXColors.h -- C++ Color Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + namespace Colors + { + // Standard colors (Red/Green/Blue/Alpha) + XMGLOBALCONST XMVECTORF32 AliceBlue = { { { 0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = { { { 0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Aqua = { { { 0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Aquamarine = { { { 0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Azure = { { { 0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Beige = { { { 0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Bisque = { { { 1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Black = { { { 0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = { { { 1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Blue = { { { 0.000000000f, 0.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 BlueViolet = { { { 0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Brown = { { { 0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 BurlyWood = { { { 0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 CadetBlue = { { { 0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Chartreuse = { { { 0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Chocolate = { { { 0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Coral = { { { 1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = { { { 0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Cornsilk = { { { 1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Crimson = { { { 0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Cyan = { { { 0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkBlue = { { { 0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkCyan = { { { 0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = { { { 0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkGray = { { { 0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkGreen = { { { 0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkKhaki = { { { 0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkMagenta = { { { 0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = { { { 0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrange = { { { 1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrchid = { { { 
0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkRed = { { { 0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSalmon = { { { 0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = { { { 0.560784340f, 0.737254918f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = { { { 0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = { { { 0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = { { { 0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DarkViolet = { { { 0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DeepPink = { { { 1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = { { { 0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DimGray = { { { 0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 DodgerBlue = { { { 0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Firebrick = { { { 0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 FloralWhite = { { { 1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 ForestGreen = { { { 0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Fuchsia = { { { 1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Gainsboro = { { { 0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 GhostWhite = { { { 0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Gold = { { { 1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Goldenrod = { { { 0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Gray = { { { 0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Green = { { { 0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 GreenYellow = { { { 0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Honeydew = { { { 0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 HotPink = { { { 1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 IndianRed = { { { 0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Indigo = { { { 0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Ivory = { { { 1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Khaki = { { { 0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Lavender = { { { 0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LavenderBlush = { { { 1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LawnGreen = { { { 0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LemonChiffon = { { { 1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightBlue 
= { { { 0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightCoral = { { { 0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightCyan = { { { 0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = { { { 0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightGreen = { { { 0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightGray = { { { 0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightPink = { { { 1.000000000f, 0.713725507f, 0.756862819f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSalmon = { { { 1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = { { { 0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = { { { 0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSlateGray = { { { 0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightSteelBlue = { { { 0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LightYellow = { { { 1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Lime = { { { 0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 LimeGreen = { { { 0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Linen = { { { 0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Magenta = { { { 1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Maroon = { { { 0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = { { { 0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumBlue = { { { 0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumOrchid = { { { 0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumPurple = { { { 0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = { { { 0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = { { { 0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = { { { 0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = { { { 0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = { { { 0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MidnightBlue = { { { 0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MintCream = { { { 0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 MistyRose = { { { 1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Moccasin = { { { 1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 NavajoWhite = { { { 1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Navy = { { { 0.000000000f, 
0.000000000f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 OldLace = { { { 0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Olive = { { { 0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 OliveDrab = { { { 0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Orange = { { { 1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 OrangeRed = { { { 1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Orchid = { { { 0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = { { { 0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleGreen = { { { 0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = { { { 0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = { { { 0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PapayaWhip = { { { 1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PeachPuff = { { { 1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Peru = { { { 0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Pink = { { { 1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Plum = { { { 0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 PowderBlue = { { { 0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Purple = { { { 0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Red = { { { 1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 RosyBrown = { { { 0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 RoyalBlue = { { { 0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SaddleBrown = { { { 0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Salmon = { { { 0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SandyBrown = { { { 0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SeaGreen = { { { 0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SeaShell = { { { 1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Sienna = { { { 0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Silver = { { { 0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SkyBlue = { { { 0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SlateBlue = { { { 0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SlateGray = { { { 0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Snow = { { { 1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SpringGreen = { { { 0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 SteelBlue = { { { 0.274509817f, 0.509803951f, 
0.705882370f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Tan = { { { 0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Teal = { { { 0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Thistle = { { { 0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Tomato = { { { 1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Transparent = { { { 0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Turquoise = { { { 0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Violet = { { { 0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Wheat = { { { 0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 White = { { { 1.000000000f, 1.000000000f, 1.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 WhiteSmoke = { { { 0.960784376f, 0.960784376f, 0.960784376f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 Yellow = { { { 1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f } } }; + XMGLOBALCONST XMVECTORF32 YellowGreen = { { { 0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f } } }; + + } // namespace Colors + +} // namespace DirectX + diff --git a/DirectXMath/DirectXMath.h b/DirectXMath/DirectXMath.h new file mode 100644 index 0000000..ce9fa0f --- /dev/null +++ b/DirectXMath/DirectXMath.h @@ -0,0 +1,2242 @@ +//------------------------------------------------------------------------------------- +// DirectXMath.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#ifndef __cplusplus +#error DirectX Math requires C++ +#endif + +#define DIRECTX_MATH_VERSION 316 + +#if defined(_MSC_VER) && (_MSC_VER < 1910) +#error DirectX Math requires Visual C++ 2017 or later. 
+#endif
+
+#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_)
+#define _XM_VECTORCALL_ 1
+#endif
+
+#if _XM_VECTORCALL_
+#define XM_CALLCONV __vectorcall
+#elif defined(__GNUC__)
+#define XM_CALLCONV
+#else
+#define XM_CALLCONV __fastcall
+#endif
+
+#ifndef XM_DEPRECATED
+#ifdef __GNUC__
+#define XM_DEPRECATED __attribute__ ((deprecated))
+#else
+#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
+#endif
+#endif
+
+#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_AVX2_INTRINSICS_
+#endif
+
+#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_FMA3_INTRINSICS_
+#endif
+
+#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_F16C_INTRINSICS_
+#endif
+
+#if !defined(_XM_F16C_INTRINSICS_) && defined(__F16C__) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_F16C_INTRINSICS_
+#endif
+
+#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
+#define _XM_AVX_INTRINSICS_
+#endif
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
+#define _XM_AVX_INTRINSICS_
+#endif
+
+#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
+#define _XM_AVX_INTRINSICS_
+#endif
+
+#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
+#define _XM_SSE4_INTRINSICS_
+#endif
+
+#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_)
+#define _XM_SSE3_INTRINSICS_
+#endif
+
+#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
+#define _XM_SSE_INTRINSICS_
+#endif
+
+#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64)
+#define _XM_SSE_INTRINSICS_
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
+#define _XM_ARM_NEON_INTRINSICS_
+#elif !defined(_XM_NO_INTRINSICS_)
+#error DirectX Math does not support this target
+#endif
+#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+#if defined(_XM_SSE_INTRINSICS_) && defined(_MSC_VER) && (_MSC_VER >= 1920) && !defined(__clang__) && !defined(_XM_SVML_INTRINSICS_) && !defined(_XM_DISABLE_INTEL_SVML_)
+#define _XM_SVML_INTRINSICS_
+#endif
+
+#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && (defined(__clang__) || defined(__GNUC__))
+#define _XM_NO_XMVECTOR_OVERLOADS_
+#endif
+
+#pragma warning(push)
+#pragma warning(disable:4514 4820)
+// C4514/4820: Off by default noise
+#include <math.h>
+#include <float.h>
+#pragma warning(pop)
+
+#ifndef _XM_NO_INTRINSICS_
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4987)
+// C4987: Off by default noise
+#include <intrin.h>
+#pragma warning(pop)
+#endif
+
+#if (defined(__clang__) || defined(__GNUC__)) && (__x86_64__ || __i386__)
+#include <cpuid.h>
+#endif
+
+#ifdef _XM_SSE_INTRINSICS_
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef _XM_SSE3_INTRINSICS_
+#include <pmmintrin.h>
+#endif
+
+#ifdef _XM_SSE4_INTRINSICS_
+#include <smmintrin.h>
+#endif
+
+#ifdef _XM_AVX_INTRINSICS_
+#include <immintrin.h>
+#endif
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64))
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+#endif // !_XM_NO_INTRINSICS_
+
+//#include "sal.h"
+#include <assert.h>
+
+#pragma warning(push)
+#pragma warning(disable : 4005 4668)
+// C4005/4668: Old header issue
+#include <stdint.h>
+#pragma warning(pop)
+
+#ifdef __GNUC__
+#define XM_ALIGNED_DATA(x) __attribute__ ((aligned(x)))
+#define XM_ALIGNED_STRUCT(x) struct __attribute__ ((aligned(x)))
+#else
+#define XM_ALIGNED_DATA(x) __declspec(align(x))
+#define XM_ALIGNED_STRUCT(x) __declspec(align(x)) struct
+#endif
+
+/****************************************************************************
+ *
+ * Conditional intrinsics
+ *
+ ****************************************************************************/
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+#if defined(_XM_NO_MOVNT_)
+#define XM_STREAM_PS( p, a ) _mm_store_ps((p), (a))
+#define XM256_STREAM_PS( p, a ) _mm256_store_ps((p), (a))
+#define XM_SFENCE()
+#else
+#define XM_STREAM_PS( p, a ) _mm_stream_ps((p), (a))
+#define XM256_STREAM_PS( p, a ) _mm256_stream_ps((p), (a))
+#define XM_SFENCE() _mm_sfence()
+#endif
+
+#if defined(_XM_FMA3_INTRINSICS_)
+#define XM_FMADD_PS( a, b, c ) _mm_fmadd_ps((a), (b), (c))
+#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c))
+#else
+#define XM_FMADD_PS( a, b, c ) _mm_add_ps(_mm_mul_ps((a), (b)), (c))
+#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
+#endif
+
+#if defined(_XM_AVX_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
+#define XM_PERMUTE_PS( v, c ) _mm_permute_ps((v), c )
+#else
+#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps((v), (v), c )
+#endif
+
+#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+#if defined(__clang__) || defined(__GNUC__)
+#define XM_PREFETCH( a ) __builtin_prefetch(a)
+#elif defined(_MSC_VER)
+#define XM_PREFETCH( a ) __prefetch(a)
+#else
+#define XM_PREFETCH( a )
+#endif
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+namespace DirectX
+{
+
+    /****************************************************************************
+     *
+     * Constant definitions
+     *
+     ****************************************************************************/
+
+#if defined(__XNAMATH_H__) && defined(XM_PI)
+#undef XM_PI
+#undef XM_2PI
+#undef XM_1DIVPI
+#undef XM_1DIV2PI
+#undef XM_PIDIV2
+#undef XM_PIDIV4
+#undef XM_SELECT_0
+#undef XM_SELECT_1
+#undef XM_PERMUTE_0X
+#undef XM_PERMUTE_0Y
+#undef XM_PERMUTE_0Z
+#undef XM_PERMUTE_0W
+#undef XM_PERMUTE_1X
+#undef XM_PERMUTE_1Y
+#undef XM_PERMUTE_1Z
+#undef XM_PERMUTE_1W
+#undef XM_CRMASK_CR6
+#undef XM_CRMASK_CR6TRUE
+#undef XM_CRMASK_CR6FALSE
+#undef XM_CRMASK_CR6BOUNDS
+#undef XM_CACHE_LINE_SIZE
+#endif
+
+    constexpr float XM_PI = 3.141592654f;
+    constexpr float XM_2PI = 6.283185307f;
+    constexpr float XM_1DIVPI = 0.318309886f;
+    constexpr float XM_1DIV2PI = 0.159154943f;
+    constexpr float XM_PIDIV2 = 1.570796327f;
+    constexpr float XM_PIDIV4 = 0.785398163f;
+
+    constexpr uint32_t XM_SELECT_0 = 0x00000000;
+    constexpr uint32_t XM_SELECT_1 = 0xFFFFFFFF;
+
+    constexpr uint32_t XM_PERMUTE_0X = 0;
+    constexpr uint32_t XM_PERMUTE_0Y = 1;
+    constexpr uint32_t XM_PERMUTE_0Z = 2;
+    constexpr uint32_t XM_PERMUTE_0W = 3;
+    constexpr uint32_t XM_PERMUTE_1X = 4;
+    constexpr uint32_t XM_PERMUTE_1Y = 5;
+    constexpr uint32_t XM_PERMUTE_1Z = 6;
+    constexpr uint32_t XM_PERMUTE_1W = 7;
+
+    constexpr uint32_t XM_SWIZZLE_X = 0;
+    constexpr uint32_t XM_SWIZZLE_Y = 1;
+    constexpr uint32_t XM_SWIZZLE_Z = 2;
+    constexpr uint32_t XM_SWIZZLE_W = 3;
+
+    constexpr uint32_t XM_CRMASK_CR6 = 0x000000F0;
+    constexpr uint32_t XM_CRMASK_CR6TRUE = 0x00000080;
+    constexpr uint32_t XM_CRMASK_CR6FALSE = 0x00000020;
+    constexpr uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE;
+
+    constexpr size_t XM_CACHE_LINE_SIZE = 64;
+
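// A hedged consumer sketch of what the XM_CRMASK_CR6* constants above encode:
// recording ("R") comparisons such as XMVectorEqualR, declared further down
// this header, pack an all-true/all-false summary into a CR6-style condition
// value, and the XMComparison* helpers defined just below decode it. Not part
// of the header; the include path is assumed from this diff.
#include <cstdio>
#include "DirectXMath/DirectXMath.h"
using namespace DirectX;

int main()
{
    XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
    uint32_t cr = 0;
    XMVectorEqualR(&cr, a, a);   // compare and record: here all four lanes match
    // XMComparisonAllTrue tests the XM_CRMASK_CR6TRUE bit defined above.
    std::printf("all lanes equal: %s\n", XMComparisonAllTrue(cr) ? "yes" : "no");
    return 0;
}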
+
+    /****************************************************************************
+     *
+     * Macros
+     *
+     ****************************************************************************/
+
+#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue)
+#undef XMComparisonAllTrue
+#undef XMComparisonAnyTrue
+#undef XMComparisonAllFalse
+#undef XMComparisonAnyFalse
+#undef XMComparisonMixed
+#undef XMComparisonAllInBounds
+#undef XMComparisonAnyOutOfBounds
+#endif
+
+    // Unit conversion
+
+    inline constexpr float XMConvertToRadians(float fDegrees) noexcept { return fDegrees * (XM_PI / 180.0f); }
+    inline constexpr float XMConvertToDegrees(float fRadians) noexcept { return fRadians * (180.0f / XM_PI); }
+
+    // Condition register evaluation following a recording (R) comparison
+
+    inline constexpr bool XMComparisonAllTrue(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); }
+    inline constexpr bool XMComparisonAnyTrue(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); }
+    inline constexpr bool XMComparisonAllFalse(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); }
+    inline constexpr bool XMComparisonAnyFalse(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE); }
+    inline constexpr bool XMComparisonMixed(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6) == 0); }
+    inline constexpr bool XMComparisonAllInBounds(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); }
+    inline constexpr bool XMComparisonAnyOutOfBounds(uint32_t CR) noexcept { return (((CR)&XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); }
+
+
+    /****************************************************************************
+     *
+     * Data types
+     *
+     ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable:4068 4201 4365 4324 4820)
+    // C4068: ignore unknown pragmas
+    // C4201: nonstandard extension used : nameless struct/union
+    // C4365: Off by default noise
+    // C4324/4820: padding warnings
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
+#endif
+
+//------------------------------------------------------------------------------
+#if defined(_XM_NO_INTRINSICS_)
+    struct __vector4
+    {
+        union
+        {
+            float vector4_f32[4];
+            uint32_t vector4_u32[4];
+        };
+    };
+#endif // _XM_NO_INTRINSICS_
+
+    //------------------------------------------------------------------------------
+    // Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
+    // boundary and mapped to hardware vector registers
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    using XMVECTOR = __m128;
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    using XMVECTOR = float32x4_t;
+#else
+    using XMVECTOR = __vector4;
+#endif
+
+    // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise
+#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
+    typedef const XMVECTOR FXMVECTOR;
+#else
+    typedef const XMVECTOR& FXMVECTOR;
+#endif
+
+    // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and vector call; by reference otherwise
+#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
+    typedef const XMVECTOR GXMVECTOR;
+#else
+    typedef const XMVECTOR& GXMVECTOR;
+#endif
+
+    // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise
+#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
+    typedef const XMVECTOR HXMVECTOR;
+#else
+    typedef const XMVECTOR& HXMVECTOR;
+#endif
+
+    // Fix-up for (7th+) XMVECTOR parameters to pass by reference
+    typedef const XMVECTOR& CXMVECTOR;
+
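// A hedged sketch of the parameter-passing convention these typedefs
// implement for user functions: FXMVECTOR for the first three XMVECTOR
// arguments, GXMVECTOR for the fourth, HXMVECTOR for the fifth and sixth,
// CXMVECTOR from the seventh on, with XM_CALLCONV on the function itself.
// Not part of the header; the function name is hypothetical and the include
// path is assumed from this diff.
#include "DirectXMath/DirectXMath.h"
using namespace DirectX;

// The typedefs select by-register or by-reference passing per platform/slot.
XMVECTOR XM_CALLCONV AverageSeven(FXMVECTOR v1, FXMVECTOR v2, FXMVECTOR v3,
                                  GXMVECTOR v4, HXMVECTOR v5, HXMVECTOR v6,
                                  CXMVECTOR v7)
{
    XMVECTOR sum = XMVectorAdd(XMVectorAdd(v1, v2), XMVectorAdd(v3, v4));
    sum = XMVectorAdd(sum, XMVectorAdd(v5, XMVectorAdd(v6, v7)));
    return XMVectorScale(sum, 1.0f / 7.0f);   // declared later in this header
}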
+    //------------------------------------------------------------------------------
+    // Conversion types for constants
+    XM_ALIGNED_STRUCT(16) XMVECTORF32
+    {
+        union
+        {
+            float f[4];
+            XMVECTOR v;
+        };
+
+        inline operator XMVECTOR() const noexcept { return v; }
+        inline operator const float* () const noexcept { return f; }
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
+        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
+        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
+#endif
+    };
+
+    XM_ALIGNED_STRUCT(16) XMVECTORI32
+    {
+        union
+        {
+            int32_t i[4];
+            XMVECTOR v;
+        };
+
+        inline operator XMVECTOR() const noexcept { return v; }
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
+        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
+        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
+#endif
+    };
+
+    XM_ALIGNED_STRUCT(16) XMVECTORU8
+    {
+        union
+        {
+            uint8_t u[16];
+            XMVECTOR v;
+        };
+
+        inline operator XMVECTOR() const noexcept { return v; }
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
+        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
+        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
+#endif
+    };
+
+    XM_ALIGNED_STRUCT(16) XMVECTORU32
+    {
+        union
+        {
+            uint32_t u[4];
+            XMVECTOR v;
+        };
+
+        inline operator XMVECTOR() const noexcept { return v; }
+#ifdef _XM_NO_INTRINSICS_
+#elif defined(_XM_SSE_INTRINSICS_)
+        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
+        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && defined(__GNUC__)
+        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
+        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
+#endif
+    };
+
+    //------------------------------------------------------------------------------
+    // Vector operators
+
+#ifndef _XM_NO_XMVECTOR_OVERLOADS_
+    XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept;
+    XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept;
+
+    XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2) noexcept;
+    XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2) noexcept;
+    XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2) noexcept;
+    XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2) noexcept;
+
+    XMVECTOR& operator*= (XMVECTOR& V, float S) noexcept;
+    XMVECTOR& operator/= (XMVECTOR& V, float S) noexcept;
+
+    XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2) noexcept;
+    XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2) noexcept;
+    XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2) noexcept;
+    XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2) noexcept;
+    XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S) noexcept;
+    XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V) noexcept;
+    XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S) noexcept;
+#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */
+
+    //------------------------------------------------------------------------------
+    // Matrix type: Sixteen 32 bit floating point components aligned on a
+    // 16 byte boundary and mapped to four hardware vector registers
+
+    struct XMMATRIX;
+
+    // Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise
+#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_)
+    typedef const XMMATRIX FXMMATRIX;
+#else
+    typedef const XMMATRIX& FXMMATRIX;
+#endif
+
+    // Fix-up for (2nd+) XMMATRIX parameters to pass by reference
+    typedef const XMMATRIX& CXMMATRIX;
+
+#ifdef _XM_NO_INTRINSICS_
+    struct XMMATRIX
+#else
+    XM_ALIGNED_STRUCT(16) XMMATRIX
+#endif
+    {
+#ifdef _XM_NO_INTRINSICS_
+        union
+        {
+            XMVECTOR r[4];
+            struct
+            {
+                float _11, _12, _13, _14;
+                float _21, _22, _23, _24;
+                float _31, _32, _33, _34;
+                float _41, _42, _43, _44;
+            };
+            float m[4][4];
+        };
+#else
+        XMVECTOR r[4];
+#endif
+
+        XMMATRIX() = default;
+
+        XMMATRIX(const XMMATRIX&) = default;
+
+#if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431)
+        XMMATRIX& operator= (const XMMATRIX& M) noexcept { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; }
+#else
+        XMMATRIX& operator=(const XMMATRIX&) = default;
+
+        XMMATRIX(XMMATRIX&&) = default;
+        XMMATRIX& operator=(XMMATRIX&&) = default;
+#endif
+
+        constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) noexcept : r{ R0,R1,R2,R3 } {}
+        XMMATRIX(float m00, float m01, float m02, float m03,
+            float m10, float m11, float m12, float m13,
+            float m20, float m21, float m22, float m23,
+            float m30, float m31, float m32, float m33) noexcept;
+        explicit XMMATRIX(_In_reads_(16) const float* pArray) noexcept;
+
+#ifdef _XM_NO_INTRINSICS_
+        float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; }
+        float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; }
+#endif
+
+        XMMATRIX operator+ () const noexcept { return *this; }
+        XMMATRIX operator- () const noexcept;
+
+        XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M) noexcept;
+        XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M) noexcept;
+        XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M) noexcept;
+        XMMATRIX& operator*= (float S) noexcept;
+        XMMATRIX& operator/= (float S) noexcept;
+
+        XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const noexcept;
+        XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const noexcept;
+        XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const noexcept;
+        XMMATRIX operator*
(float S) const noexcept; + XMMATRIX operator/ (float S) const noexcept; + + friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M) noexcept; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit floating point components + struct XMFLOAT2 + { + float x; + float y; + + XMFLOAT2() = default; + + XMFLOAT2(const XMFLOAT2&) = default; + XMFLOAT2& operator=(const XMFLOAT2&) = default; + + XMFLOAT2(XMFLOAT2&&) = default; + XMFLOAT2& operator=(XMFLOAT2&&) = default; + + constexpr XMFLOAT2(float _x, float _y) noexcept : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + }; + + // 2D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT2A : public XMFLOAT2 + { + XMFLOAT2A() = default; + + XMFLOAT2A(const XMFLOAT2A&) = default; + XMFLOAT2A& operator=(const XMFLOAT2A&) = default; + + XMFLOAT2A(XMFLOAT2A&&) = default; + XMFLOAT2A& operator=(XMFLOAT2A&&) = default; + + constexpr XMFLOAT2A(float _x, float _y) noexcept : XMFLOAT2(_x, _y) {} + explicit XMFLOAT2A(_In_reads_(2) const float* pArray) noexcept : XMFLOAT2(pArray) {} + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit signed integer components + struct XMINT2 + { + int32_t x; + int32_t y; + + XMINT2() = default; + + XMINT2(const XMINT2&) = default; + XMINT2& operator=(const XMINT2&) = default; + + XMINT2(XMINT2&&) = default; + XMINT2& operator=(XMINT2&&) = default; + + constexpr XMINT2(int32_t _x, int32_t _y) noexcept : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + }; + + // 2D Vector; 32 bit unsigned integer components + struct XMUINT2 + { + uint32_t x; + uint32_t y; + + XMUINT2() = default; + + XMUINT2(const XMUINT2&) = default; + XMUINT2& operator=(const XMUINT2&) = default; + + XMUINT2(XMUINT2&&) = default; + XMUINT2& operator=(XMUINT2&&) = default; + + constexpr XMUINT2(uint32_t _x, uint32_t _y) noexcept : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit floating point components + struct XMFLOAT3 + { + float x; + float y; + float z; + + XMFLOAT3() = default; + + XMFLOAT3(const XMFLOAT3&) = default; + XMFLOAT3& operator=(const XMFLOAT3&) = default; + + XMFLOAT3(XMFLOAT3&&) = default; + XMFLOAT3& operator=(XMFLOAT3&&) = default; + + constexpr XMFLOAT3(float _x, float _y, float _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + // 3D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3A : public XMFLOAT3 + { + XMFLOAT3A() = default; + + XMFLOAT3A(const XMFLOAT3A&) = default; + XMFLOAT3A& operator=(const XMFLOAT3A&) = default; + + XMFLOAT3A(XMFLOAT3A&&) = default; + XMFLOAT3A& operator=(XMFLOAT3A&&) = default; + + constexpr XMFLOAT3A(float _x, float _y, float _z) noexcept : XMFLOAT3(_x, _y, _z) {} + explicit XMFLOAT3A(_In_reads_(3) const float* pArray) noexcept : XMFLOAT3(pArray) {} + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit signed integer components + struct XMINT3 + { + int32_t x; + int32_t y; + int32_t z; + + XMINT3() = default; + + 
XMINT3(const XMINT3&) = default; + XMINT3& operator=(const XMINT3&) = default; + + XMINT3(XMINT3&&) = default; + XMINT3& operator=(XMINT3&&) = default; + + constexpr XMINT3(int32_t _x, int32_t _y, int32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + // 3D Vector; 32 bit unsigned integer components + struct XMUINT3 + { + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() = default; + + XMUINT3(const XMUINT3&) = default; + XMUINT3& operator=(const XMUINT3&) = default; + + XMUINT3(XMUINT3&&) = default; + XMUINT3& operator=(XMUINT3&&) = default; + + constexpr XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit floating point components + struct XMFLOAT4 + { + float x; + float y; + float z; + float w; + + XMFLOAT4() = default; + + XMFLOAT4(const XMFLOAT4&) = default; + XMFLOAT4& operator=(const XMFLOAT4&) = default; + + XMFLOAT4(XMFLOAT4&&) = default; + XMFLOAT4& operator=(XMFLOAT4&&) = default; + + constexpr XMFLOAT4(float _x, float _y, float _z, float _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + }; + + // 4D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4A : public XMFLOAT4 + { + XMFLOAT4A() = default; + + XMFLOAT4A(const XMFLOAT4A&) = default; + XMFLOAT4A& operator=(const XMFLOAT4A&) = default; + + XMFLOAT4A(XMFLOAT4A&&) = default; + XMFLOAT4A& operator=(XMFLOAT4A&&) = default; + + constexpr XMFLOAT4A(float _x, float _y, float _z, float _w) noexcept : XMFLOAT4(_x, _y, _z, _w) {} + explicit XMFLOAT4A(_In_reads_(4) const float* pArray) noexcept : XMFLOAT4(pArray) {} + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit signed integer components + struct XMINT4 + { + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() = default; + + XMINT4(const XMINT4&) = default; + XMINT4& operator=(const XMINT4&) = default; + + XMINT4(XMINT4&&) = default; + XMINT4& operator=(XMINT4&&) = default; + + constexpr XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + }; + + // 4D Vector; 32 bit unsigned integer components + struct XMUINT4 + { + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() = default; + + XMUINT4(const XMUINT4&) = default; + XMUINT4& operator=(const XMUINT4&) = default; + + XMUINT4(XMUINT4&&) = default; + XMUINT4& operator=(XMUINT4&&) = default; + + constexpr XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + }; + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#endif + + //------------------------------------------------------------------------------ + // 3x3 Matrix: 32 bit floating point components + struct 
XMFLOAT3X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() = default; + + XMFLOAT3X3(const XMFLOAT3X3&) = default; + XMFLOAT3X3& operator=(const XMFLOAT3X3&) = default; + + XMFLOAT3X3(XMFLOAT3X3&&) = default; + XMFLOAT3X3& operator=(XMFLOAT3X3&&) = default; + + constexpr XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) {} + explicit XMFLOAT3X3(_In_reads_(9) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + //------------------------------------------------------------------------------ + // 4x3 Row-major Matrix: 32 bit floating point components + struct XMFLOAT4X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + float f[12]; + }; + + XMFLOAT4X3() = default; + + XMFLOAT4X3(const XMFLOAT4X3&) = default; + XMFLOAT4X3& operator=(const XMFLOAT4X3&) = default; + + XMFLOAT4X3(XMFLOAT4X3&&) = default; + XMFLOAT4X3& operator=(XMFLOAT4X3&&) = default; + + constexpr XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + _41(m30), _42(m31), _43(m32) {} + explicit XMFLOAT4X3(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + // 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X3A : public XMFLOAT4X3 + { + XMFLOAT4X3A() = default; + + XMFLOAT4X3A(const XMFLOAT4X3A&) = default; + XMFLOAT4X3A& operator=(const XMFLOAT4X3A&) = default; + + XMFLOAT4X3A(XMFLOAT4X3A&&) = default; + XMFLOAT4X3A& operator=(XMFLOAT4X3A&&) = default; + + constexpr XMFLOAT4X3A(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) noexcept : + XMFLOAT4X3(m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32) {} + explicit XMFLOAT4X3A(_In_reads_(12) const float* pArray) noexcept : XMFLOAT4X3(pArray) {} + }; + + //------------------------------------------------------------------------------ + // 3x4 Column-major Matrix: 32 bit floating point components + struct XMFLOAT3X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + }; + float m[3][4]; + float f[12]; + }; + + XMFLOAT3X4() = default; + + XMFLOAT3X4(const XMFLOAT3X4&) = default; + XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default; + + XMFLOAT3X4(XMFLOAT3X4&&) = default; + XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default; + + constexpr XMFLOAT3X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23) {} + explicit XMFLOAT3X4(_In_reads_(12) const float* pArray) 
noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + // 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3X4A : public XMFLOAT3X4 + { + XMFLOAT3X4A() = default; + + XMFLOAT3X4A(const XMFLOAT3X4A&) = default; + XMFLOAT3X4A& operator=(const XMFLOAT3X4A&) = default; + + XMFLOAT3X4A(XMFLOAT3X4A&&) = default; + XMFLOAT3X4A& operator=(XMFLOAT3X4A&&) = default; + + constexpr XMFLOAT3X4A(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) noexcept : + XMFLOAT3X4(m00, m01, m02, m03, m10, m11, m12, m13, m20, m21, m22, m23) {} + explicit XMFLOAT3X4A(_In_reads_(12) const float* pArray) noexcept : XMFLOAT3X4(pArray) {} + }; + + //------------------------------------------------------------------------------ + // 4x4 Matrix: 32 bit floating point components + struct XMFLOAT4X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() = default; + + XMFLOAT4X4(const XMFLOAT4X4&) = default; + XMFLOAT4X4& operator=(const XMFLOAT4X4&) = default; + + XMFLOAT4X4(XMFLOAT4X4&&) = default; + XMFLOAT4X4& operator=(XMFLOAT4X4&&) = default; + + constexpr XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) {} + explicit XMFLOAT4X4(_In_reads_(16) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + }; + + // 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X4A : public XMFLOAT4X4 + { + XMFLOAT4X4A() = default; + + XMFLOAT4X4A(const XMFLOAT4X4A&) = default; + XMFLOAT4X4A& operator=(const XMFLOAT4X4A&) = default; + + XMFLOAT4X4A(XMFLOAT4X4A&&) = default; + XMFLOAT4X4A& operator=(XMFLOAT4X4A&&) = default; + + constexpr XMFLOAT4X4A(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept + : XMFLOAT4X4(m00, m01, m02, m03, m10, m11, m12, m13, m20, m21, m22, m23, m30, m31, m32, m33) {} + explicit XMFLOAT4X4A(_In_reads_(16) const float* pArray) noexcept : XMFLOAT4X4(pArray) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +#pragma warning(pop) + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, 
uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + + XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource) noexcept; + + XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource) noexcept; + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ 
FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMVectorZero() noexcept; + XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept; + + float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept; + float XM_CALLCONV 
XMVectorGetX(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float* f, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetXPtr(_Out_ float* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetYPtr(_Out_ float* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetZPtr(_Out_ float* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetWPtr(_Out_ float* w, _In_ FXMVECTOR V) noexcept; + + uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t* w, _In_ FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float* f, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float* w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t* w) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + + XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept; + XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept; + XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, 
uint32_t VectorIndex2, uint32_t VectorIndex3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef XMVectorRotateRight +#undef XMVectorInsert +#endif + + XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept; + + XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds) noexcept; + + XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max) noexcept; + XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + + XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept; + XMVECTOR 
XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept; + XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV 
XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G) noexcept; + + /**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, 
FXMVECTOR Point) noexcept; + XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept; + XMVECTOR 
XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float 
ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + + /**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV 
XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (VectorCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept; + + _Success_(return) + bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR* outScale, _Out_ XMVECTOR* outRotQuat, _Out_ XMVECTOR* outTrans, _In_ FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept; + XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV 
XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept; + XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + + + /**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + + bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t) 
noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T) noexcept; + void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept; + + void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q) noexcept; + + /**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + + bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept; + bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2) noexcept; + void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2) noexcept; + XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (PlaneCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (PlaneCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX M) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3) noexcept; + + /**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV 
XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
+
+    bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept;
+    bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C) noexcept;
+    XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2) noexcept;
+    XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation) noexcept;
+    XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept;
+    XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept;
+
+    XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept;
+    XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept;
+
+
+    /****************************************************************************
+     *
+     * Miscellaneous operations
+     *
+     ****************************************************************************/
+
+    bool XMVerifyCPUSupport() noexcept;
+
+    XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex) noexcept;
+
+    bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept;
+    float XMScalarModAngle(float Value) noexcept;
+
+    float XMScalarSin(float Value) noexcept;
+    float XMScalarSinEst(float Value) noexcept;
+
+    float XMScalarCos(float Value) noexcept;
+    float XMScalarCosEst(float Value) noexcept;
+
+    void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept;
+    void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept;
+
+    float XMScalarASin(float Value) noexcept;
+    float XMScalarASinEst(float Value) noexcept;
+
+    float XMScalarACos(float Value) noexcept;
+    float XMScalarACosEst(float Value) noexcept;
+
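A caller-side sketch for the scalar helpers above (illustrative only; the wrapper name is hypothetical, the XM* call is the one declared here): XMScalarSinCos produces both results from a single argument reduction, which is the usual way to feed a rotation.

    // Hypothetical wrapper: one call yields both terms of a rotation.
    inline void SinCosOf(float angleRadians, float& s, float& c) noexcept
    {
        DirectX::XMScalarSinCos(&s, &c, angleRadians);
    }
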
+    /****************************************************************************
+     *
+     * Templates
+     *
+     ****************************************************************************/
+
+#if defined(__XNAMATH_H__) && defined(XMMin)
+#undef XMMin
+#undef XMMax
+#endif
+
+    template<class T> inline T XMMin(T a, T b) noexcept { return (a < b) ? a : b; }
+    template<class T> inline T XMMax(T a, T b) noexcept { return (a > b) ? a : b; }
+
+    //------------------------------------------------------------------------------
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+// PermuteHelper internal template (SSE only)
+    namespace Internal
+    {
+        // Slow path fallback for permutes that do not map to a single SSE shuffle opcode.
+        template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept
+            {
+                static const XMVECTORU32 selectMask =
+                { { {
+                        WhichX ? 0xFFFFFFFF : 0,
+                        WhichY ? 0xFFFFFFFF : 0,
+                        WhichZ ? 0xFFFFFFFF : 0,
+                        WhichW ? 0xFFFFFFFF : 0,
+                } } };
+
+                XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
+                XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
+
+                XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+                XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+                return _mm_or_ps(masked1, masked2);
+            }
+        };
+
+        // Fast path for permutes that only read from the first vector.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) noexcept { return XM_PERMUTE_PS(v1, Shuffle); }
+        };
+
+        // Fast path for permutes that only read from the second vector.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) noexcept { return XM_PERMUTE_PS(v2, Shuffle); }
+        };
+
+        // Fast path for permutes that read XY from the first vector, ZW from the second.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v1, v2, Shuffle); }
+        };
+
+        // Fast path for permutes that read XY from the second vector, ZW from the first.
+        template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+        {
+            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v2, v1, Shuffle); }
+        };
+    }
+
+#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+    // General permute template
+    template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
+    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) noexcept
+    {
+        static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+        static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+        static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+        static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+        constexpr uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+        constexpr bool WhichX = PermuteX > 3;
+        constexpr bool WhichY = PermuteY > 3;
+        constexpr bool WhichZ = PermuteZ > 3;
+        constexpr bool WhichW = PermuteW > 3;
+
+        return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+#else
+
+        return XMVectorPermute(V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW);
+
+#endif
+    }
+
+    // Special-case permute templates
+    template<> inline constexpr XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR) noexcept { return V1; }
+    template<> inline constexpr XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) noexcept { return V2; }
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movelh_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6, 7, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movehl_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpacklo_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpackhi_ps(V1, V2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); }
+#endif
+
+#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x1); }
+    template<>
inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x3); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x4); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x5); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x6); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x7); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x8); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x9); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xA); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xB); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xC); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xD); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + // If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead + // The mirror cases are not spelled out here as the programmer can always swap the arguments + // (i.e. 
prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 2, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 5, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 2, 4, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[0]; } + template<> inline 
XMVECTOR XM_CALLCONV XMVectorPermute<1, 3, 5, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 2, 3, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 1); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 4, 5, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 3); }
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+    //------------------------------------------------------------------------------
+
+    // General swizzle template
+    template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
+    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) noexcept
+    {
+        static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
+        static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
+        static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
+        static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+        return XM_PERMUTE_PS(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
+#else
+
+        return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
+
+#endif
+    }
+
+    // Specialized swizzles
+    template<> inline constexpr XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) noexcept { return V; }
+
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { return _mm_movelh_ps(V, V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { return _mm_movehl_ps(V, V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return _mm_unpacklo_ps(V, V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return _mm_unpackhi_ps(V, V); }
+#endif
+
+#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return _mm_moveldup_ps(V); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return _mm_movehdup_ps(V); }
+#endif
+
+#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return _mm_broadcastss_ps(V); }
+#endif
+
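A usage sketch for the swizzle template (illustrative function name, not part of the patch): because the lane indices are template parameters, a call like the one below resolves at compile time to one of the specializations rather than the general four-index path.

    // Broadcast lane z to all four lanes; on SSE this is a single XM_PERMUTE_PS,
    // and on ARM NEON it matches the vdupq_lane_f32 specialization below.
    inline DirectX::XMVECTOR BroadcastZ(DirectX::FXMVECTOR v) noexcept
    {
        return DirectX::XMVectorSwizzle<2, 2, 2, 2>(v);
    }
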
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 0); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 1, 1>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 1); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 2, 2>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 0); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 3, 3, 3>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 1); }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 3, 2>(FXMVECTOR V) noexcept { return vrev64q_f32(V); }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { float32x2_t vt = vget_low_f32(V); return vcombine_f32(vt, vt); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { float32x2_t vt = vget_high_f32(V); return vcombine_f32(vt, vt); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 1, 0>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_low_f32(V)); return vcombine_f32(vt, vt); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 3, 2>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_high_f32(V)); return vcombine_f32(vt, vt); }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 3, 2>(FXMVECTOR V) noexcept { return vcombine_f32(vget_low_f32(V), vrev64_f32(vget_high_f32(V))); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 2, 3>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V)), vget_high_f32(V)); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vget_high_f32(V), vrev64_f32(vget_low_f32(V))); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 0, 1>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vget_low_f32(V)); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vrev64_f32(vget_low_f32(V))); }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[0]; }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[0]; }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 2, 0, 2>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[0]; }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 3, 1, 3>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[1]; }
+
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 2, 3, 0>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 1); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 0, 1>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 2); }
+    template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 0, 1, 2>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 3); }
+
+#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
+
+    //------------------------------------------------------------------------------
+
+    template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) noexcept
+    {
+        static_assert(Elements < 4, "Elements template parameter out of range");
+        return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
+    }
+
+    template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) noexcept
+    {
+        static_assert(Elements < 4, "Elements template parameter out of range");
+        return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
+    }
+
+    template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) noexcept
+    {
+        static_assert(Elements < 4, "Elements template parameter out of range");
+        return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
+    }
+
+    template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3>
+    inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) noexcept
+    {
+        XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1);
+        return XMVectorSelect(VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control);
+    }
+
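A usage sketch for the permute templates (illustrative function name): indices 0-3 select lanes of the first argument and 4-7 lanes of the second, so the pattern below interleaves the low halves and maps to the _mm_unpacklo_ps / vzipq_f32 specializations above.

    // (a.x, b.x, a.y, b.y)
    inline DirectX::XMVECTOR InterleaveLow(DirectX::FXMVECTOR a, DirectX::FXMVECTOR b) noexcept
    {
        return DirectX::XMVectorPermute<0, 4, 1, 5>(a, b);
    }
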
/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + + // The purpose of the following global constants is to prevent redundant + // reloading of the constants when they are referenced by more than one + // separate inline math routine called within the same function. Declaring + // a constant locally within a routine is sufficient to prevent redundant + // reloads of that constant when that single routine is called multiple + // times in a function, but if the constant is used (and declared) in a + // separate math routine it would be reloaded. + +#ifndef XMGLOBALCONST +#if defined(__GNUC__) && !defined(__MINGW32__) +#define XMGLOBALCONST extern const __attribute__((weak)) +#else +#define XMGLOBALCONST extern const __declspec(selectany) +#endif +#endif + + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = { { { -0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = { { { -2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = { { { -0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = { { { -2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = { { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = { { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = { { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = { { { +1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = { { { +0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = { { { -0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = { { { -0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = { { { +0.999866f, +0.999866f, +0.999866f, +0.999866f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = { { { -0.3302995f, +0.180141f, -0.085133f, +0.0208351f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = { { { 2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = { { { +1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f } } }; + XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = { { { XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = { { { 1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = { { { 0.0f, 1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = { { { 0.0f, 0.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = { { { 0.0f, 0.0f, 0.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = { { { -1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = { { { 0.0f, -1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = { { { 0.0f, 0.0f, -1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = { { { 0.0f, 0.0f, 0.0f, -1.0f } } 
}; + XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = { { { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegate3 = { { { 0x80000000, 0x80000000, 0x80000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskXY = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMask3 = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX = { { { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskY = { { { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskZ = { { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskW = { { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne = { { { 1.0f, 1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne3 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMZero = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwo = { { { 2.f, 2.f, 2.f, 2.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMFour = { { { 4.f, 4.f, 4.f, 4.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSix = { { { 6.f, 6.f, 6.f, 6.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = { { { -1.0f, -1.0f, -1.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { { { 0.5f, 0.5f, 0.5f, 0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = { { { -0.5f, -0.5f, -0.5f, -0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = { { { -XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativePi = { { { -XM_PI, -XM_PI, -XM_PI, -XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMHalfPi = { { { XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2 } } }; + XMGLOBALCONST XMVECTORF32 g_XMPi = { { { XM_PI, XM_PI, XM_PI, XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = { { { XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwoPi = { { { XM_2PI, XM_2PI, XM_2PI, XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = { { { XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMEpsilon = { { { 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f } } }; + XMGLOBALCONST XMVECTORI32 g_XMInfinity = { { { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaN = { { { 0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = { { { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMAbsMask = { { { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMin = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMax = { { { 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = { { { 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = { { { 0x00000000, 0x00000000, 0x00000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = { { { 0.0f, 0.0f, 0.0f, float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = { { { 1.0f / (255.0f * float(0x10000)), 1.0f / (255.0f * float(0x100)), 1.0f / 255.0f, 1.0f / (255.0f * float(0x1000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = { { { 0x000003FF, 
0x000FFC00, 0x3FF00000, 0xC0000000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = { { { 0x00000200, 0x00080000, 0x20000000, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = { { { -512.0f, -512.0f * float(0x400), -512.0f * float(0x100000), float(0x80000000U) } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = { { { 1.0f / 511.0f, 1.0f / (511.0f * float(0x400)), 1.0f / (511.0f * float(0x100000)), 1.0f / (3.0f * float(0x40000000)) } } };
+    XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = { { { 0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = { { { 0x00008000, 0x00000000, 0x00000000, 0x00000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = { { { -32768.0f, 0.0f, 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = { { { 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = { { { 0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = { { { 0x00008000, 0x00008000, 0x00000000, 0x00000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = { { { -32768.0f, -32768.0f, 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = { { { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 1.0f / (32767.0f * 65536.0f) } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNoFraction = { { { 8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f } } };
+    XMGLOBALCONST XMVECTORI32 g_XMMaskByte = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateX = { { { -1.0f, 1.0f, 1.0f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateY = { { { 1.0f, -1.0f, 1.0f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { { { 1.0f, 1.0f, -1.0f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMNegateW = { { { 1.0f, 1.0f, 1.0f, -1.0f } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { { { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = { { { 1.0f, 1.0f / 65536.0f, 0.0f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = { { { 1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipY = { { { 0, 0x80000000, 0, 0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipZ = { { { 0, 0, 0x80000000, 0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipW = { { { 0, 0, 0, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = { { { 0, 0x80000000, 0x80000000, 0 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipZW = { { { 0, 0, 0x80000000, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMFlipYW = { { { 0, 0x80000000, 0, 0x80000000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast<int>(0xC0000000) } } };
+    XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = { { { 0x200, 0x200 << 10, 0x200 << 20, 0 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = { { { 0, 0, 0, 32768.0f * 65536.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 0 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = { { { 1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } };
+    XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = { { { 0xFF, 0xFF00, 0xFF0000, 0xFF000000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = { { { 0x80, 0x8000, 0x800000, 0x00000000 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = { { { -128.0f, -128.0f * 256.0f, -128.0f * 65536.0f, 0 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMMaxInt = { { { 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = { { { 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { { { 12.92f, 12.92f, 12.92f, 1.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { { { 0.055f, 0.055f, 0.055f, 0.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { { { 1.055f, 1.055f, 1.055f, 1.0f } } };
+    XMGLOBALCONST XMVECTORI32 g_XMExponentBias = { { { 127, 127, 127, 127 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = { { { -126, -126, -126, -126 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = { { { 23, 23, 23, 23 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMMinNormal = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = { { { 0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = { { { 0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XMBin128 = { { { 0x43000000, 0x43000000, 0x43000000, 0x43000000 } } };
+    XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = { { { 0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000 } } };
+    XMGLOBALCONST XMVECTORI32 g_XM253 = { { { 253, 253, 253, 253 } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = { { { -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = { { { +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = { { { -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = { { { +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = { { { -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = { { { +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = { { { -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = { { { +1.442693f, +1.442693f, +1.442693f, +1.442693f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = { { { -0.721242f, -0.721242f, -0.721242f, -0.721242f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = { { { +0.479384f, +0.479384f, +0.479384f, +0.479384f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = { { { -0.350295f, -0.350295f, -0.350295f, -0.350295f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = { { { +0.248590f, +0.248590f, +0.248590f, +0.248590f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = { { { -0.145700f, -0.145700f, -0.145700f, -0.145700f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = { { { +0.057148f, +0.057148f, +0.057148f, +0.057148f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = { { { -0.010578f, -0.010578f, -0.010578f, -0.010578f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLgE = { { { +1.442695f, +1.442695f, +1.442695f, +1.442695f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMInvLgE = { { { +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMLg10 = { { { +3.321928f, +3.321928f, +3.321928f, +3.321928f } } };
+    XMGLOBALCONST XMVECTORF32 g_XMInvLg10 = { { { +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f } } };
+    XMGLOBALCONST XMVECTORF32 g_UByteMax = { { { 255.0f, 255.0f, 255.0f, 255.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ByteMin = { { { -127.0f, -127.0f, -127.0f, -127.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ByteMax = { { { 127.0f, 127.0f, 127.0f, 127.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ShortMin = { { { -32767.0f, -32767.0f, -32767.0f, -32767.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_ShortMax = { { { 32767.0f, 32767.0f, 32767.0f, 32767.0f } } };
+    XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f, 65535.0f, 65535.0f } } };
+
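To illustrate the reload note at the top of this block (hypothetical caller code, not in the library): an inline routine that references one of these globals shares the single weak/selectany definition, so repeated inlined uses in one function body can reuse a single load of the constant.

    // Hypothetical helper built on the globals above.
    inline DirectX::XMVECTOR Midpoint(DirectX::FXMVECTOR a, DirectX::FXMVECTOR b) noexcept
    {
        using namespace DirectX;
        return XMVectorMultiply(XMVectorAdd(a, b), g_XMOneHalf);   // one shared definition of g_XMOneHalf
    }
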
+    /****************************************************************************
+     *
+     * Implementation
+     *
+     ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101)
+    // C4068/4616: ignore unknown pragmas
+    // C4214/4204: nonstandard extension used
+    // C4365/4640: Off by default noise
+    // C6001/6101: False positives
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
+#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
+#endif
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#endif
+
+//------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept
+    {
+#if defined(_XM_NO_INTRINSICS_)
+        XMVECTORU32 vResult;
+        vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000;
+        vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000;
+        vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000;
+        vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000;
+        return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        XMVECTORU32 vResult;
+        vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000;
+        vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000;
+        vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000;
+        vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000;
+        return vResult.v;
+#else // XM_SSE_INTRINSICS_
+        static const XMVECTORU32 g_vMask1 = { { { 1, 1, 1, 1 } } };
+        // Move the parms to a vector
+        __m128i vTemp = _mm_set_epi32(static_cast<int>(C3), static_cast<int>(C2), static_cast<int>(C1), static_cast<int>(C0));
+        // Mask off the low bits
+        vTemp = _mm_and_si128(vTemp, g_vMask1);
+        // 0xFFFFFFFF on true bits
+        vTemp = _mm_cmpeq_epi32(vTemp, g_vMask1);
+        // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
+        vTemp = _mm_and_si128(vTemp, g_XMOne);
+        return _mm_castsi128_ps(vTemp);
+#endif
+    }
+
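Usage sketch (illustrative values): each argument's low bit selects 0.0f or 1.0f for the corresponding lane, so the call below builds a small constant without a memory load on the SSE path.

    DirectX::XMVECTOR xw = DirectX::XMVectorSetBinaryConstant(1, 0, 0, 1);   // (1.0f, 0.0f, 0.0f, 1.0f)
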
+    //------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+        assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+
+        using DirectX::XMConvertVectorIntToFloat;
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return XMConvertVectorIntToFloat(V.v, DivExponent);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        // Splat the int
+        int32x4_t vScale = vdupq_n_s32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = vcvtq_f32_s32(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (it's really a float)
+        vScale = vdupq_n_s32(static_cast<int>(uScale));
+        // Multiply by the reciprocal (performs a right shift by DivExponent)
+        vResult = vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
+        return vResult;
+#else // XM_SSE_INTRINSICS_
+        // Splat the int
+        __m128i vScale = _mm_set1_epi32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (it's really a float)
+        vScale = _mm_set1_epi32(static_cast<int>(uScale));
+        // Multiply by the reciprocal (performs a right shift by DivExponent)
+        vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+        return vResult;
+#endif
+    }
+
+    //------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+#if defined(_XM_NO_INTRINSICS_)
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return V.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        int32x4_t V = vdupq_n_s32(IntConstant);
+        return reinterpret_cast<XMVECTOR*>(&V)[0];
+#else // XM_SSE_INTRINSICS_
+        __m128i V = _mm_set1_epi32(IntConstant);
+        return _mm_castsi128_ps(V);
+#endif
+    }
+
+#include "DirectXMathConvert.inl"
+#include "DirectXMathVector.inl"
+#include "DirectXMathMatrix.inl"
+#include "DirectXMathMisc.inl"
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+#pragma warning(pop)
+
+} // namespace DirectX
+
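Usage sketch (illustrative values): the splatted integer is scaled by 1/2^DivExponent, handy for small fixed-point constants within the asserted ranges of -16..15 and 0..31.

    DirectX::XMVECTOR q = DirectX::XMVectorSplatConstant(3, 2);   // 3 / 2^2 = (0.75f, 0.75f, 0.75f, 0.75f)
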
diff --git a/DirectXMath/DirectXMathConvert.inl b/DirectXMath/DirectXMathConvert.inl
new file mode 100644
index 0000000..8097f20
--- /dev/null
+++ b/DirectXMath/DirectXMathConvert.inl
@@ -0,0 +1,2187 @@
+//-------------------------------------------------------------------------------------
+// DirectXMathConvert.inl -- SIMD C++ Math library
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+#pragma warning(push)
+#pragma warning(disable:4701)
+// C4701: false positives
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t  DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        auto iTemp = static_cast<int32_t>(VInt.vector4_u32[ElementIndex]);
+        Result.vector4_f32[ElementIndex] = static_cast<float>(iTemp) * fScale;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= -(65536.0f * 32768.0f))
+        {
+            iResult = (-0x7FFFFFFF) - 1;
+        }
+        else if (fTemp > (65536.0f * 32768.0f) - 128.0f)
+        {
+            iResult = 0x7FFFFFFF;
+        }
+        else {
+            iResult = static_cast<int32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = static_cast<uint32_t>(iResult);
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent));
+    // In case of positive overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxInt);
+    // Float to int conversion
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
+    vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    return vOverflow;
+#endif
+}
+
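A round-trip sketch for the two conversions above (illustrative values): XMConvertVectorFloatToInt saturates to INT32_MIN/INT32_MAX instead of wrapping, which converting back makes visible.

    using namespace DirectX;
    XMVECTOR v    = XMVectorSet(1.5f, -2.0f, 70000.0f, 0.25f);
    XMVECTOR q    = XMConvertVectorFloatToInt(v, 16);    // 16.16 fixed point; 70000 saturates to 0x7FFFFFFF
    XMVECTOR back = XMConvertVectorIntToFloat(q, 16);    // (1.5f, -2.0f, ~32768.0f, 0.25f)
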
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t  DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        Result.vector4_f32[ElementIndex] = static_cast<float>(VUInt.vector4_u32[ElementIndex]) * fScale;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt, g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt, vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    iMask = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(iMask));
+    return vResult;
+#endif
+}
+
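Usage sketch (illustrative values): with a DivExponent of 8, each unsigned lane is scaled by 1/256, a common way to normalize byte-range data.

    DirectX::XMVECTORU32 raw = { { { 255u, 128u, 64u, 0u } } };
    DirectX::XMVECTOR scaled = DirectX::XMConvertVectorUIntToFloat(raw.v, 8);   // (~0.996f, 0.5f, 0.25f, 0.0f)
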
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    return vResult;
+#endif
+}
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * Vector and matrix load operations
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = *pSource;
+    V.vector4_u32[1] = 0;
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t zero = vdupq_n_u32(0);
+    return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss(reinterpret_cast<const float*>(pSource));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = *pSource;
+    V.vector4_f32[1] = 0.f;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t zero = vdupq_n_f32(0);
+    return vld1q_lane_f32(pSource, zero, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss(pSource);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32(pSource);
+    uint32x2_t zero = vdup_n_u32(0);
+    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    uint32x2_t x = vld1_u32_ex(pSource, 64);
+#else
+    uint32x2_t x = vld1_u32(pSource);
+#endif
+    uint32x2_t zero = vdup_n_u32(0);
+    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+#endif
+}
+
+//------------------------------------------------------------------------------
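+// Note on the A-suffixed variants (illustrative, assumes only standard C++):
+// they assert 16-byte alignment, which XMFLOAT2A/XMFLOAT3A/XMFLOAT4A provide
+// via alignas. A caller-managed buffer can opt in the same way, e.g.:
+//
+//     alignas(16) uint32_t data[2] = { 1u, 2u };
+//     XMVECTOR v = XMLoadInt2A(data);    // satisfies the alignment assert
+//
+// An unaligned pointer trips the assert in debug builds and may fault on
+// targets that use aligned SIMD loads.
+
+//------------------------------------------------------------------------------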
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat2(const XMFLOAT2* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32(reinterpret_cast<const float*>(pSource));
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32(x, zero);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat2A(const XMFLOAT2A* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    float32x2_t x = vld1_f32_ex(reinterpret_cast<const float*>(pSource), 64);
+#else
+    float32x2_t x = vld1_f32(reinterpret_cast<const float*>(pSource));
+#endif
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32(x, zero);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt2(const XMINT2* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = static_cast<float>(pSource->x);
+    V.vector4_f32[1] = static_cast<float>(pSource->y);
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x2_t x = vld1_s32(reinterpret_cast<const int32_t*>(pSource));
+    float32x2_t v = vcvt_f32_s32(x);
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32(v, zero);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    return _mm_cvtepi32_ps(_mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt2(const XMUINT2* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = static_cast<float>(pSource->x);
+    V.vector4_f32[1] = static_cast<float>(pSource->y);
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32(reinterpret_cast<const uint32_t*>(pSource));
+    float32x2_t v = vcvt_f32_u32(x);
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32(v, zero);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
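+    // The sign bit is masked off (making the lane v - 2^31 where it was set),
+    // the value goes through the signed conversion path, and g_XMFixUnsigned
+    // (2^31 as a float) is added back to exactly those lanes.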
+    XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V, vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32(pSource);
+    uint32x2_t zero = vdup_n_u32(0);
+    uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0);
+    return vreinterpretq_f32_u32(vcombine_u32(x, y));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
+    return _mm_insert_ps(xy, z, 0x20);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
+    return _mm_movelh_ps(xy, z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Reads an extra integer which is zero'd
+#ifdef _MSC_VER
+    uint32x4_t V = vld1q_u32_ex(pSource, 128);
+#else
+    uint32x4_t V = vld1q_u32(pSource);
+#endif
+    return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
+    return _mm_insert_ps(xy, z, 0x20);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
+    return _mm_movelh_ps(xy, z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat3(const XMFLOAT3* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32(reinterpret_cast<const float*>(pSource));
+    float32x2_t zero = vdup_n_f32(0);
+    float32x2_t y = vld1_lane_f32(reinterpret_cast<const float*>(pSource) + 2, zero, 0);
+    return vcombine_f32(x, y);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(&pSource->z);
+    return _mm_insert_ps(xy, z, 0x20);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(&pSource->z);
+    return _mm_movelh_ps(xy, z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Reads an extra float which is zero'd
+#ifdef _MSC_VER
+    float32x4_t V = vld1q_f32_ex(reinterpret_cast<const float*>(pSource), 128);
+#else
+    float32x4_t V = vld1q_f32(reinterpret_cast<const float*>(pSource));
+#endif
+    return vsetq_lane_f32(0, V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Reads an extra float which is zero'd
+    __m128 V = _mm_load_ps(&pSource->x);
+    return _mm_and_ps(V, g_XMMask3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt3(const XMINT3* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+    V.vector4_f32[0] = static_cast<float>(pSource->x);
+    V.vector4_f32[1] = static_cast<float>(pSource->y);
+    V.vector4_f32[2] = static_cast<float>(pSource->z);
+    V.vector4_f32[3] = 0.f;
+    return V;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x2_t x = vld1_s32(reinterpret_cast<const int32_t*>(pSource));
+    int32x2_t zero = vdup_n_s32(0);
+    int32x2_t y = vld1_lane_s32(reinterpret_cast<const int32_t*>(pSource) + 2, zero, 0);
+    int32x4_t v = vcombine_s32(x, y);
+    return vcvtq_f32_s32(v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(&pSource->z));
+    __m128 V = _mm_movelh_ps(xy, z);
+    return _mm_cvtepi32_ps(_mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt3(const XMUINT3* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = static_cast<float>(pSource->x);
+    V.vector4_f32[1] = static_cast<float>(pSource->y);
+    V.vector4_f32[2] = static_cast<float>(pSource->z);
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32(reinterpret_cast<const uint32_t*>(pSource));
+    uint32x2_t zero = vdup_n_u32(0);
+    uint32x2_t y = vld1_lane_u32(reinterpret_cast<const uint32_t*>(pSource) + 2, zero, 0);
+    uint32x4_t v = vcombine_u32(x, y);
+    return vcvtq_f32_u32(v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
+    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(&pSource->z));
+    __m128 V = _mm_movelh_ps(xy, z);
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V, vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = pSource[3];
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_u32(pSource));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = pSource[3];
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    return vld1q_u32_ex(pSource, 128);
+#else
+    return vreinterpretq_f32_u32(vld1q_u32(pSource));
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_load_si128(reinterpret_cast<const __m128i*>(pSource));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat4(const XMFLOAT4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = pSource->w;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_f32(reinterpret_cast<const float*>(pSource));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_loadu_ps(&pSource->x);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat4A(const XMFLOAT4A* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = pSource->w;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    return vld1q_f32_ex(reinterpret_cast<const float*>(pSource), 128);
+#else
+    return vld1q_f32(reinterpret_cast<const float*>(pSource));
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps(&pSource->x);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt4(const XMINT4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+    V.vector4_f32[0] = static_cast<float>(pSource->x);
+    V.vector4_f32[1] = static_cast<float>(pSource->y);
+    V.vector4_f32[2] = static_cast<float>(pSource->z);
+    V.vector4_f32[3] = static_cast<float>(pSource->w);
+    return V;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x4_t v = vld1q_s32(reinterpret_cast<const int32_t*>(pSource));
+    return vcvtq_f32_s32(v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
+    return _mm_cvtepi32_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt4(const XMUINT4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = static_cast<float>(pSource->x);
+    V.vector4_f32[1] = static_cast<float>(pSource->y);
+    V.vector4_f32[2] = static_cast<float>(pSource->z);
+    V.vector4_f32[3] = static_cast<float>(pSource->w);
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t*>(pSource));
+    return vcvtq_f32_u32(v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V), g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V), vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[0][1];
+    M.r[0].vector4_f32[2] = pSource->m[0][2];
+    M.r[0].vector4_f32[3] = 0.0f;
+
+    M.r[1].vector4_f32[0] = pSource->m[1][0];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[1][2];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[2][0];
+    M.r[2].vector4_f32[1] = pSource->m[2][1];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = 0.0f;
+    M.r[3].vector4_f32[1] = 0.0f;
+    M.r[3].vector4_f32[2] = 0.0f;
+    M.r[3].vector4_f32[3] = 1.0f;
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
+    float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
+    float32x2_t v2 = vcreate_f32(static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&pSource->m[2][2])));
+    float32x4_t T = vextq_f32(v0, v1, 3);
+
+    XMMATRIX M;
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3));
+    M.r[2] = vcombine_f32(vget_high_f32(v1), v2);
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 Z = _mm_setzero_ps();
+
+    __m128 V1 = _mm_loadu_ps(&pSource->m[0][0]);
+    __m128 V2 = _mm_loadu_ps(&pSource->m[1][1]);
+    __m128 V3 = _mm_load_ss(&pSource->m[2][2]);
+
+    __m128 T1 = _mm_unpackhi_ps(V1, Z);
+    __m128 T2 = _mm_unpacklo_ps(V2, Z);
+    __m128 T3 = _mm_shuffle_ps(V3, T2, _MM_SHUFFLE(0, 1, 0, 0));
+    __m128 T4 = _mm_movehl_ps(T2, T3);
+    __m128 T5 = _mm_movehl_ps(Z, T1);
+
+    XMMATRIX M;
+    M.r[0] = _mm_movelh_ps(V1, T1);
+    M.r[1] = _mm_add_ps(T4, T5);
+    M.r[2] = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 3, 2));
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[0][1];
+    M.r[0].vector4_f32[2] = pSource->m[0][2];
+    M.r[0].vector4_f32[3] = 0.0f;
+
+    M.r[1].vector4_f32[0] = pSource->m[1][0];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[1][2];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[2][0];
+    M.r[2].vector4_f32[1] = pSource->m[2][1];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = pSource->m[3][0];
+    M.r[3].vector4_f32[1] = pSource->m[3][1];
+    M.r[3].vector4_f32[2] = pSource->m[3][2];
+    M.r[3].vector4_f32[3] = 1.0f;
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
+    float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
+    float32x4_t v2 = vld1q_f32(&pSource->m[2][2]);
+
+    float32x4_t T1 = vextq_f32(v0, v1, 3);
+    float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2));
+    float32x4_t T3 = vextq_f32(v2, v2, 1);
+
+    XMMATRIX M;
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
+    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Use unaligned load instructions to
+    // load the 12 floats
+    // vTemp1 = x1,y1,z1,x2
+    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
+    // vTemp2 = y2,z2,x3,y3
+    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
+    // vTemp4 = z3,x4,y4,z4
+    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
+    // vTemp3 = x3,y3,z3,z3
+    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
+    // vTemp2 = y2,z2,x2,x2
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
+    // vTemp2 = x2,y2,z2,z2
+    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
+    // vTemp1 = x1,y1,z1,0
+    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
+    // vTemp2 = x2,y2,z2,0
+    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
+    // vTemp3 = x3,y3,z3,0
+    vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
+    // vTemp4i = x4,y4,z4,0
+    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
+    // vTemp4i = x4,y4,z4,1.0f
+    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
+    XMMATRIX M(vTemp1,
+        vTemp2,
+        vTemp3,
+        _mm_castsi128_ps(vTemp4i));
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[0][1];
+    M.r[0].vector4_f32[2] = pSource->m[0][2];
+    M.r[0].vector4_f32[3] = 0.0f;
+
+    M.r[1].vector4_f32[0] = pSource->m[1][0];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[1][2];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[2][0];
+    M.r[2].vector4_f32[1] = pSource->m[2][1];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = pSource->m[3][0];
+    M.r[3].vector4_f32[1] = pSource->m[3][1];
+    M.r[3].vector4_f32[2] = pSource->m[3][2];
+    M.r[3].vector4_f32[3] = 1.0f;
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    float32x4_t v0 = vld1q_f32_ex(&pSource->m[0][0], 128);
+    float32x4_t v1 = vld1q_f32_ex(&pSource->m[1][1], 128);
+    float32x4_t v2 = vld1q_f32_ex(&pSource->m[2][2], 128);
+#else
+    float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
+    float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
+    float32x4_t v2 = vld1q_f32(&pSource->m[2][2]);
+#endif
+
+    float32x4_t T1 = vextq_f32(v0, v1, 3);
+    float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2));
+    float32x4_t T3 = vextq_f32(v2, v2, 1);
+
+    XMMATRIX M;
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
+    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Use aligned load instructions to
+    // load the 12 floats
+    // vTemp1 = x1,y1,z1,x2
+    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
+    // vTemp2 = y2,z2,x3,y3
+    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
+    // vTemp4 = z3,x4,y4,z4
+    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
+    // vTemp3 = x3,y3,z3,z3
+    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
+    // vTemp2 = y2,z2,x2,x2
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
+    // vTemp2 = x2,y2,z2,z2
+    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
+    // vTemp1 = x1,y1,z1,0
+    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
+    // vTemp2 = x2,y2,z2,0
+    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
+    // vTemp3 = x3,y3,z3,0
+    vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
+    // vTemp4i = x4,y4,z4,0
+    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
+    // vTemp4i = x4,y4,z4,1.0f
+    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
+    XMMATRIX M(vTemp1,
+        vTemp2,
+        vTemp3,
+        _mm_castsi128_ps(vTemp4i));
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[1][0];
+    M.r[0].vector4_f32[2] = pSource->m[2][0];
+    M.r[0].vector4_f32[3] = 0.0f;
+
+    M.r[1].vector4_f32[0] = pSource->m[0][1];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[2][1];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[0][2];
+    M.r[2].vector4_f32[1] = pSource->m[1][2];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = pSource->m[0][3];
+    M.r[3].vector4_f32[1] = pSource->m[1][3];
+    M.r[3].vector4_f32[2] = pSource->m[2][3];
+    M.r[3].vector4_f32[3] = 1.0f;
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);
+    float32x4_t vTemp1 = vld1q_f32(&pSource->_31);
+
+    float32x2_t l = vget_low_f32(vTemp1);
+    float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);
+    float32x2_t rl = vrev64_f32(l);
+    float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl);
+
+    float32x2_t h = vget_high_f32(vTemp1);
+    float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);
+    float32x2_t rh = vrev64_f32(h);
+    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
+
+    XMMATRIX M = {};
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
+    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = _mm_loadu_ps(&pSource->_11);
+    M.r[1] = _mm_loadu_ps(&pSource->_21);
+    M.r[2] = _mm_loadu_ps(&pSource->_31);
+    M.r[3] = g_XMIdentityR3;
+
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+    XMMATRIX mResult;
+
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[1][0];
+    M.r[0].vector4_f32[2] = pSource->m[2][0];
+    M.r[0].vector4_f32[3] = 0.0f;
+
+    M.r[1].vector4_f32[0] = pSource->m[0][1];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[2][1];
+    M.r[1].vector4_f32[3] = 0.0f;
+
+    M.r[2].vector4_f32[0] = pSource->m[0][2];
+    M.r[2].vector4_f32[1] = pSource->m[1][2];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = 0.0f;
+
+    M.r[3].vector4_f32[0] = pSource->m[0][3];
+    M.r[3].vector4_f32[1] = pSource->m[1][3];
+    M.r[3].vector4_f32[2] = pSource->m[2][3];
+    M.r[3].vector4_f32[3] = 1.0f;
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128);
+    float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128);
+#else
+    float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);
+    float32x4_t vTemp1 = vld1q_f32(&pSource->_31);
+#endif
+
+    float32x2_t l = vget_low_f32(vTemp1);
+    float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);
+    float32x2_t rl = vrev64_f32(l);
+    float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl);
+
+    float32x2_t h = vget_high_f32(vTemp1);
+    float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);
+    float32x2_t rh = vrev64_f32(h);
+    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
+
+    XMMATRIX M = {};
+    M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
+    M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
+    M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
+    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = _mm_load_ps(&pSource->_11);
+    M.r[1] = _mm_load_ps(&pSource->_21);
+    M.r[2] = _mm_load_ps(&pSource->_31);
+    M.r[3] = g_XMIdentityR3;
+
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+    XMMATRIX mResult;
+
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat4x4(const XMFLOAT4X4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[0][1];
+    M.r[0].vector4_f32[2] = pSource->m[0][2];
+    M.r[0].vector4_f32[3] = pSource->m[0][3];
+
+    M.r[1].vector4_f32[0] = pSource->m[1][0];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[1][2];
+    M.r[1].vector4_f32[3] = pSource->m[1][3];
+
+    M.r[2].vector4_f32[0] = pSource->m[2][0];
+    M.r[2].vector4_f32[1] = pSource->m[2][1];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = pSource->m[2][3];
+
+    M.r[3].vector4_f32[0] = pSource->m[3][0];
+    M.r[3].vector4_f32[1] = pSource->m[3][1];
+    M.r[3].vector4_f32[2] = pSource->m[3][2];
+    M.r[3].vector4_f32[3] = pSource->m[3][3];
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_11));
+    M.r[1] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_21));
+    M.r[2] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_31));
+    M.r[3] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_41));
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = _mm_loadu_ps(&pSource->_11);
+    M.r[1] = _mm_loadu_ps(&pSource->_21);
+    M.r[2] = _mm_loadu_ps(&pSource->_31);
+    M.r[3] = _mm_loadu_ps(&pSource->_41);
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A(const XMFLOAT4X4A* pSource) noexcept
+{
+    assert(pSource);
+    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMMATRIX M;
+    M.r[0].vector4_f32[0] = pSource->m[0][0];
+    M.r[0].vector4_f32[1] = pSource->m[0][1];
+    M.r[0].vector4_f32[2] = pSource->m[0][2];
+    M.r[0].vector4_f32[3] = pSource->m[0][3];
+
+    M.r[1].vector4_f32[0] = pSource->m[1][0];
+    M.r[1].vector4_f32[1] = pSource->m[1][1];
+    M.r[1].vector4_f32[2] = pSource->m[1][2];
+    M.r[1].vector4_f32[3] = pSource->m[1][3];
+
+    M.r[2].vector4_f32[0] = pSource->m[2][0];
+    M.r[2].vector4_f32[1] = pSource->m[2][1];
+    M.r[2].vector4_f32[2] = pSource->m[2][2];
+    M.r[2].vector4_f32[3] = pSource->m[2][3];
+
+    M.r[3].vector4_f32[0] = pSource->m[3][0];
+    M.r[3].vector4_f32[1] = pSource->m[3][1];
+    M.r[3].vector4_f32[2] = pSource->m[3][2];
+    M.r[3].vector4_f32[3] = pSource->m[3][3];
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMMATRIX M;
+#ifdef _MSC_VER
+    M.r[0] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_11), 128);
+    M.r[1] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_21), 128);
+    M.r[2] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_31), 128);
+    M.r[3] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_41), 128);
+#else
+    M.r[0] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_11));
+    M.r[1] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_21));
+    M.r[2] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_31));
+    M.r[3] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_41));
+#endif
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX M;
+    M.r[0] = _mm_load_ps(&pSource->_11);
+    M.r[1] = _mm_load_ps(&pSource->_21);
+    M.r[2] = _mm_load_ps(&pSource->_31);
+    M.r[3] = _mm_load_ps(&pSource->_41);
+    return M;
+#endif
+}
+
+/****************************************************************************
+ *
+ * Vector and matrix store operations
+ *
+ ****************************************************************************/
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    *pDestination = XMVectorGetIntX(V);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(pDestination), V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat
+(
+    float* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    *pDestination = XMVectorGetX(V);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(pDestination, V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(pDestination, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt2
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
+    vst1_u32(pDestination, VL);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt2A
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
+#ifdef _MSC_VER
+    vst1_u32_ex(pDestination, VL, 64);
+#else
+    vst1_u32(pDestination, VL);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat2
+(
+    XMFLOAT2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat2A
+(
+    XMFLOAT2A* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+#ifdef _MSC_VER
+    vst1_f32_ex(reinterpret_cast<float*>(pDestination), VL, 64);
+#else
+    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreSInt2
+(
+    XMINT2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t v = vget_low_f32(V);
+    int32x2_t iv = vcvt_s32_f32(v);
+    vst1_s32(reinterpret_cast<int32_t*>(pDestination), iv);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(V);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    // Write two ints
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUInt2
+(
+    XMUINT2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t v = vget_low_f32(V);
+    uint32x2_t iv = vcvt_u32_f32(v);
+    vst1_u32(reinterpret_cast<uint32_t*>(pDestination), iv);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to >=0
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
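+    // (Same unsigned fixup as XMConvertVectorFloatToUInt above: subtract 2^31
+    // where needed, convert through the signed path, then restore the high bit.)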
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    // Write two uints
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt3
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
+    vst1_u32(pDestination, VL);
+    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination[2]), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt3A
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
+#ifdef _MSC_VER
+    vst1_u32_ex(pDestination, VL, 64);
+#else
+    vst1_u32(pDestination, VL);
+#endif
+    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V), 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = _mm_movehl_ps(V, V);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination[2]), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3
+(
+    XMFLOAT3* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
+    vst1q_lane_f32(reinterpret_cast<float*>(pDestination) + 2, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *reinterpret_cast<int*>(&pDestination->x) = _mm_extract_ps(V, 0);
+    *reinterpret_cast<int*>(&pDestination->y) = _mm_extract_ps(V, 1);
+    *reinterpret_cast<int*>(&pDestination->z) = _mm_extract_ps(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(&pDestination->z, z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3A
+(
+    XMFLOAT3A* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+#ifdef _MSC_VER
+    vst1_f32_ex(reinterpret_cast<float*>(pDestination), VL, 64);
+#else
+    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
+#endif
+    vst1q_lane_f32(reinterpret_cast<float*>(pDestination) + 2, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    *reinterpret_cast<int*>(&pDestination->z) = _mm_extract_ps(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
+    __m128 z = _mm_movehl_ps(V, V);
+    _mm_store_ss(&pDestination->z, z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreSInt3
+(
+    XMINT3* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
+    pDestination->z = static_cast<int32_t>(V.vector4_f32[2]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x4_t v = vcvtq_s32_f32(V);
+    int32x2_t vL = vget_low_s32(v);
+    vst1_s32(reinterpret_cast<int32_t*>(pDestination), vL);
+    vst1q_lane_s32(reinterpret_cast<int32_t*>(pDestination) + 2, v, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(V);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    // Write 3 ints
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vOverflow));
+    __m128 z = XM_PERMUTE_PS(vOverflow, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->z), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUInt3
+(
+    XMUINT3* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
+    pDestination->z = static_cast<uint32_t>(V.vector4_f32[2]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t v = vcvtq_u32_f32(V);
+    uint32x2_t vL = vget_low_u32(v);
+    vst1_u32(reinterpret_cast<uint32_t*>(pDestination), vL);
+    vst1q_lane_u32(reinterpret_cast<uint32_t*>(pDestination) + 2, v, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to >=0
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    // Write 3 uints
+    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(vResult));
+    __m128 z = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->z), z);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt4
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+    pDestination[3] = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreInt4A
+(
+    uint32_t* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination[0] = V.vector4_u32[0];
+    pDestination[1] = V.vector4_u32[1];
+    pDestination[2] = V.vector4_u32[2];
+    pDestination[3] = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    vst1q_u32_ex(pDestination, V, 128);
+#else
+    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4
+(
+    XMFLOAT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+    pDestination->w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_f32(reinterpret_cast<float*>(pDestination), V);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_storeu_ps(&pDestination->x, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4A
+(
+    XMFLOAT4A* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = V.vector4_f32[0];
+    pDestination->y = V.vector4_f32[1];
+    pDestination->z = V.vector4_f32[2];
+    pDestination->w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    vst1q_f32_ex(reinterpret_cast<float*>(pDestination), V, 128);
+#else
+    vst1q_f32(reinterpret_cast<float*>(pDestination), V);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ps(&pDestination->x, V);
+#endif
+}
+
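+// Illustrative round-trip (a sketch, not part of the original header; the
+// hypothetical ExampleScaleFloat4A uses XMVectorScale, declared earlier in
+// DirectXMath.h, and is fenced out so it does not alter the library):
+#if 0
+inline void ExampleScaleFloat4A(XMFLOAT4A* pData, float scale) noexcept
+{
+    XMVECTOR v = XMLoadFloat4A(pData);  // aligned load (asserts 16-byte alignment)
+    v = XMVectorScale(v, scale);        // operate entirely in SIMD registers
+    XMStoreFloat4A(pData, v);           // aligned store back to memory
+}
+#endif
+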
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreSInt4
+(
+    XMINT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
+    pDestination->z = static_cast<int32_t>(V.vector4_f32[2]);
+    pDestination->w = static_cast<int32_t>(V.vector4_f32[3]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x4_t v = vcvtq_s32_f32(V);
+    vst1q_s32(reinterpret_cast<int32_t*>(pDestination), v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(V);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUInt4
+(
+    XMUINT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
+    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
+    pDestination->z = static_cast<uint32_t>(V.vector4_f32[2]);
+    pDestination->w = static_cast<uint32_t>(V.vector4_f32[3]);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t v = vcvtq_u32_f32(V);
+    vst1q_u32(reinterpret_cast<uint32_t*>(pDestination), v);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to >=0
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue, vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult, vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult, vOverflow);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3x3
+(
+    XMFLOAT3X3* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
+    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
+    vst1q_f32(&pDestination->m[0][0], T2);
+
+    T1 = vextq_f32(M.r[1], M.r[1], 1);
+    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
+    vst1q_f32(&pDestination->m[1][1], T2);
+
+    vst1q_lane_f32(&pDestination->m[2][2], M.r[2], 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = M.r[0];
+    XMVECTOR vTemp2 = M.r[1];
+    XMVECTOR vTemp3 = M.r[2];
+    XMVECTOR vWork = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 0, 2, 2));
+    vTemp1 = _mm_shuffle_ps(vTemp1, vWork, _MM_SHUFFLE(2, 0, 1, 0));
+    _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
+    _mm_storeu_ps(&pDestination->m[1][1], vTemp2);
+    vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(&pDestination->m[2][2], vTemp3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4x3
+(
+    XMFLOAT4X3* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+    pDestination->m[3][0] = M.r[3].vector4_f32[0];
+    pDestination->m[3][1] = M.r[3].vector4_f32[1];
+    pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
+    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
+    vst1q_f32(&pDestination->m[0][0], T2);
+
+    T1 = vextq_f32(M.r[1], M.r[1], 1);
+    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
+    vst1q_f32(&pDestination->m[1][1], T2);
+
+    T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
+    T2 = vextq_f32(T1, M.r[3], 3);
+    vst1q_f32(&pDestination->m[2][2], T2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = M.r[0];
+    XMVECTOR vTemp2 = M.r[1];
+    XMVECTOR vTemp3 = M.r[2];
+    XMVECTOR vTemp4 = M.r[3];
+    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(2, 2, 0, 0));
+    vTemp1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 2, 1, 0));
+    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2));
+    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0));
+    _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
+    _mm_storeu_ps(&pDestination->m[1][1], vTemp2x);
+    _mm_storeu_ps(&pDestination->m[2][2], vTemp3);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4x3A
+(
+    XMFLOAT4X3A* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+    pDestination->m[3][0] = M.r[3].vector4_f32[0];
+    pDestination->m[3][1] = M.r[3].vector4_f32[1];
+    pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
+    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
+    vst1q_f32_ex(&pDestination->m[0][0], T2, 128);
+
+    T1 = vextq_f32(M.r[1], M.r[1], 1);
+    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
+    vst1q_f32_ex(&pDestination->m[1][1], T2, 128);
+
+    T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
+    T2 = vextq_f32(T1, M.r[3], 3);
+    vst1q_f32_ex(&pDestination->m[2][2], T2, 128);
+#else
+    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
+    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
+    vst1q_f32(&pDestination->m[0][0], T2);
+
+    T1 = vextq_f32(M.r[1], M.r[1], 1);
+    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
+    vst1q_f32(&pDestination->m[1][1], T2);
+
+    T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
+    T2 = vextq_f32(T1, M.r[3], 3);
+    vst1q_f32(&pDestination->m[2][2], T2);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    // x1,y1,z1,w1
+    XMVECTOR vTemp1 = M.r[0];
+    // x2,y2,z2,w2
+    XMVECTOR vTemp2 = M.r[1];
+    // x3,y3,z3,w3
+    XMVECTOR vTemp3 = M.r[2];
+    // x4,y4,z4,w4
+    XMVECTOR vTemp4 = M.r[3];
+    // z1,z1,x2,y2
+    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(1, 0, 2, 2));
+    // y2,z2,x3,y3 (Final)
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
+    // x1,y1,z1,x2 (Final)
+    vTemp1 = _mm_shuffle_ps(vTemp1, vTemp, _MM_SHUFFLE(2, 0, 1, 0));
+    // z3,z3,x4,x4
+    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2));
+    // z3,x4,y4,z4 (Final)
+    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0));
+    // Store in 3 operations
+    _mm_store_ps(&pDestination->m[0][0], vTemp1);
+    _mm_store_ps(&pDestination->m[1][1], vTemp2);
+    _mm_store_ps(&pDestination->m[2][2], vTemp3);
+#endif
+}
+
+//------------------------------------------------------------------------------
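+// Layout note (illustrative): XMFLOAT4X3 packs 4 rows of 3 floats into 12
+// contiguous floats, so the three vector stores above land at &m[0][0],
+// &m[1][1] and &m[2][2], byte offsets 0, 16 and 32, covering floats 0-3,
+// 4-7 and 8-11 with no overlap.
+
+//------------------------------------------------------------------------------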
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3x4
+(
+    XMFLOAT3X4* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[1].vector4_f32[0];
+    pDestination->m[0][2] = M.r[2].vector4_f32[0];
+    pDestination->m[0][3] = M.r[3].vector4_f32[0];
+
+    pDestination->m[1][0] = M.r[0].vector4_f32[1];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[2].vector4_f32[1];
+    pDestination->m[1][3] = M.r[3].vector4_f32[1];
+
+    pDestination->m[2][0] = M.r[0].vector4_f32[2];
+    pDestination->m[2][1] = M.r[1].vector4_f32[2];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+    pDestination->m[2][3] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
+    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
+
+    vst1q_f32(&pDestination->m[0][0], T0.val[0]);
+    vst1q_f32(&pDestination->m[1][0], T0.val[1]);
+    vst1q_f32(&pDestination->m[2][0], T1.val[0]);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+    // x.x,y.x,z.x,w.x
+    XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+
+    _mm_storeu_ps(&pDestination->m[0][0], r0);
+    _mm_storeu_ps(&pDestination->m[1][0], r1);
+    _mm_storeu_ps(&pDestination->m[2][0], r2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3x4A
+(
+    XMFLOAT3X4A* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[1].vector4_f32[0];
+    pDestination->m[0][2] = M.r[2].vector4_f32[0];
+    pDestination->m[0][3] = M.r[3].vector4_f32[0];
+
+    pDestination->m[1][0] = M.r[0].vector4_f32[1];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[2].vector4_f32[1];
+    pDestination->m[1][3] = M.r[3].vector4_f32[1];
+
+    pDestination->m[2][0] = M.r[0].vector4_f32[2];
+    pDestination->m[2][1] = M.r[1].vector4_f32[2];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+    pDestination->m[2][3] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
+    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
+
+#ifdef _MSC_VER
+    vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128);
+    vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128);
+    vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128);
+#else
+    vst1q_f32(&pDestination->m[0][0], T0.val[0]);
+    vst1q_f32(&pDestination->m[1][0], T0.val[1]);
+    vst1q_f32(&pDestination->m[2][0], T1.val[0]);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+    // x.x,y.x,z.x,w.x
+    XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+
+    _mm_store_ps(&pDestination->m[0][0], r0);
+    _mm_store_ps(&pDestination->m[1][0], r1);
+    _mm_store_ps(&pDestination->m[2][0], r2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4x4
+(
+    XMFLOAT4X4* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+    pDestination->m[0][3] = M.r[0].vector4_f32[3];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+    pDestination->m[1][3] = M.r[1].vector4_f32[3];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+    pDestination->m[2][3] = M.r[2].vector4_f32[3];
+
+    pDestination->m[3][0] = M.r[3].vector4_f32[0];
+    pDestination->m[3][1] = M.r[3].vector4_f32[1];
+    pDestination->m[3][2] = M.r[3].vector4_f32[2];
+    pDestination->m[3][3] = M.r[3].vector4_f32[3];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_11), M.r[0]);
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_21), M.r[1]);
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_31), M.r[2]);
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_41), M.r[3]);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_storeu_ps(&pDestination->_11, M.r[0]);
+    _mm_storeu_ps(&pDestination->_21, M.r[1]);
+    _mm_storeu_ps(&pDestination->_31, M.r[2]);
+    _mm_storeu_ps(&pDestination->_41, M.r[3]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat4x4A
+(
+    XMFLOAT4X4A* pDestination,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pDestination);
+    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+    pDestination->m[0][0] = M.r[0].vector4_f32[0];
+    pDestination->m[0][1] = M.r[0].vector4_f32[1];
+    pDestination->m[0][2] = M.r[0].vector4_f32[2];
+    pDestination->m[0][3] = M.r[0].vector4_f32[3];
+
+    pDestination->m[1][0] = M.r[1].vector4_f32[0];
+    pDestination->m[1][1] = M.r[1].vector4_f32[1];
+    pDestination->m[1][2] = M.r[1].vector4_f32[2];
+    pDestination->m[1][3] = M.r[1].vector4_f32[3];
+
+    pDestination->m[2][0] = M.r[2].vector4_f32[0];
+    pDestination->m[2][1] = M.r[2].vector4_f32[1];
+    pDestination->m[2][2] = M.r[2].vector4_f32[2];
+    pDestination->m[2][3] = M.r[2].vector4_f32[3];
+
+    pDestination->m[3][0] = M.r[3].vector4_f32[0];
+    pDestination->m[3][1] = M.r[3].vector4_f32[1];
+    pDestination->m[3][2] = M.r[3].vector4_f32[2];
+    pDestination->m[3][3] = M.r[3].vector4_f32[3];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#ifdef _MSC_VER
+    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128);
+    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128);
+    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128);
+    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128);
+#else
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_11), M.r[0]);
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_21), M.r[1]);
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_31), M.r[2]);
+    vst1q_f32(reinterpret_cast<float*>(&pDestination->_41), M.r[3]);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ps(&pDestination->_11, M.r[0]);
+    _mm_store_ps(&pDestination->_21, M.r[1]);
+    _mm_store_ps(&pDestination->_31, M.r[2]);
+    _mm_store_ps(&pDestination->_41, M.r[3]);
+#endif
+}
+
diff --git a/DirectXMath/DirectXMathMatrix.inl b/DirectXMath/DirectXMathMatrix.inl
new file mode 100644
index 0000000..5337457
--- /dev/null
+++ b/DirectXMath/DirectXMathMatrix.inl
@@ -0,0 +1,3422 @@
+//-------------------------------------------------------------------------------------
+// DirectXMathMatrix.inl -- SIMD C++ Math library
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Matrix
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+ // Comparison operations
+ //------------------------------------------------------------------------------
+
+ //------------------------------------------------------------------------------
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+#endif
+
+// Return true if any entry in the matrix is NaN
+inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    size_t i = 16;
+    auto pWork = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
+    do {
+        // Fetch value into integer unit
+        uint32_t uTest = pWork[0];
+        // Remove sign
+        uTest &= 0x7FFFFFFFU;
+        // NaN is 0x7F800001 through 0x7FFFFFFF inclusive
+        uTest -= 0x7F800001U;
+        if (uTest < 0x007FFFFFU)
+        {
+            break;      // NaN found
+        }
+        ++pWork;        // Next entry
+    } while (--i);
+    return (i != 0);    // i == 0 if nothing matched
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Load in registers
+    float32x4_t vX = M.r[0];
+    float32x4_t vY = M.r[1];
+    float32x4_t vZ = M.r[2];
+    float32x4_t vW = M.r[3];
+    // Test themselves to check for NaN
+    uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX));
+    uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY));
+    uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ));
+    uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW));
+    // Or all the results
+    xmask = vorrq_u32(xmask, zmask);
+    ymask = vorrq_u32(ymask, wmask);
+    xmask = vorrq_u32(xmask, ymask);
+    // If any tested true, return true
+    uint8x8x2_t vTemp = vzip_u8(
+        vget_low_u8(vreinterpretq_u8_u32(xmask)),
+        vget_high_u8(vreinterpretq_u8_u32(xmask)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    return (r != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Load in registers
+    XMVECTOR vX = M.r[0];
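+    // NaN is the only value for which x != x, so the self-comparisons below
+    // set a lane to all-ones exactly where that lane holds a NaN.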
+    XMVECTOR vY = M.r[1];
+    XMVECTOR vZ = M.r[2];
+    XMVECTOR vW = M.r[3];
+    // Test themselves to check for NaN
+    vX = _mm_cmpneq_ps(vX, vX);
+    vY = _mm_cmpneq_ps(vY, vY);
+    vZ = _mm_cmpneq_ps(vZ, vZ);
+    vW = _mm_cmpneq_ps(vW, vW);
+    // Or all the results
+    vX = _mm_or_ps(vX, vZ);
+    vY = _mm_or_ps(vY, vW);
+    vX = _mm_or_ps(vX, vY);
+    // If any tested true, return true
+    return (_mm_movemask_ps(vX) != 0);
+#else
+#endif
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma float_control(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+// Return true if any entry in the matrix is +/-INF
+inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    size_t i = 16;
+    auto pWork = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
+    do {
+        // Fetch value into integer unit
+        uint32_t uTest = pWork[0];
+        // Remove sign
+        uTest &= 0x7FFFFFFFU;
+        // INF is 0x7F800000
+        if (uTest == 0x7F800000U)
+        {
+            break;      // INF found
+        }
+        ++pWork;        // Next entry
+    } while (--i);
+    return (i != 0);    // i == 0 if nothing matched
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Load in registers
+    float32x4_t vX = M.r[0];
+    float32x4_t vY = M.r[1];
+    float32x4_t vZ = M.r[2];
+    float32x4_t vW = M.r[3];
+    // Mask off the sign bits
+    vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask));
+    vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask));
+    vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask));
+    vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask));
+    // Compare to infinity
+    uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity);
+    uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity);
+    uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity);
+    uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity);
+    // Or the answers together
+    xmask = vorrq_u32(xmask, zmask);
+    ymask = vorrq_u32(ymask, wmask);
+    xmask = vorrq_u32(xmask, ymask);
+    // If any tested true, return true
+    uint8x8x2_t vTemp = vzip_u8(
+        vget_low_u8(vreinterpretq_u8_u32(xmask)),
+        vget_high_u8(vreinterpretq_u8_u32(xmask)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    return (r != 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Mask off the sign bits
+    XMVECTOR vTemp1 = _mm_and_ps(M.r[0], g_XMAbsMask);
+    XMVECTOR vTemp2 = _mm_and_ps(M.r[1], g_XMAbsMask);
+    XMVECTOR vTemp3 = _mm_and_ps(M.r[2], g_XMAbsMask);
+    XMVECTOR vTemp4 = _mm_and_ps(M.r[3], g_XMAbsMask);
+    // Compare to infinity
+    vTemp1 = _mm_cmpeq_ps(vTemp1, g_XMInfinity);
+    vTemp2 = _mm_cmpeq_ps(vTemp2, g_XMInfinity);
+    vTemp3 = _mm_cmpeq_ps(vTemp3, g_XMInfinity);
+    vTemp4 = _mm_cmpeq_ps(vTemp4, g_XMInfinity);
+    // Or the answers together
+    vTemp1 = _mm_or_ps(vTemp1, vTemp2);
+    vTemp3 = _mm_or_ps(vTemp3, vTemp4);
+    vTemp1 = _mm_or_ps(vTemp1, vTemp3);
+    // If any are infinity, the signs are true.
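+    // _mm_movemask_ps packs the sign bit of each of the four comparison lanes
+    // into the low four bits of an int, so a nonzero mask means at least one
+    // element compared equal to +/-INF above.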
+    return (_mm_movemask_ps(vTemp1) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return true if the XMMatrix is equal to identity
+inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    // Use the integer pipeline to reduce branching to a minimum
+    auto pWork = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
+    // Convert 1.0f to zero and or them together
+    uint32_t uOne = pWork[0] ^ 0x3F800000U;
+    // Or all the 0.0f entries together
+    uint32_t uZero = pWork[1];
+    uZero |= pWork[2];
+    uZero |= pWork[3];
+    // 2nd row
+    uZero |= pWork[4];
+    uOne |= pWork[5] ^ 0x3F800000U;
+    uZero |= pWork[6];
+    uZero |= pWork[7];
+    // 3rd row
+    uZero |= pWork[8];
+    uZero |= pWork[9];
+    uOne |= pWork[10] ^ 0x3F800000U;
+    uZero |= pWork[11];
+    // 4th row
+    uZero |= pWork[12];
+    uZero |= pWork[13];
+    uZero |= pWork[14];
+    uOne |= pWork[15] ^ 0x3F800000U;
+    // If all zero entries are zero, then uZero==0
+    uZero &= 0x7FFFFFFF;    // Allow -0.0f
+    // If all 1.0f entries are 1.0f, then uOne==0
+    uOne |= uZero;
+    return (uOne == 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0);
+    uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1);
+    uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2);
+    uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3);
+    xmask = vandq_u32(xmask, zmask);
+    ymask = vandq_u32(ymask, wmask);
+    xmask = vandq_u32(xmask, ymask);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+    return (r == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0);
+    XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1], g_XMIdentityR1);
+    XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2], g_XMIdentityR2);
+    XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3], g_XMIdentityR3);
+    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
+    vTemp3 = _mm_and_ps(vTemp3, vTemp4);
+    vTemp1 = _mm_and_ps(vTemp1, vTemp3);
+    return (_mm_movemask_ps(vTemp1) == 0x0f);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Perform a 4x4 matrix multiply by a 4x4 matrix
+inline XMMATRIX XM_CALLCONV XMMatrixMultiply
+(
+    FXMMATRIX M1,
+    CXMMATRIX M2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMMATRIX mResult;
+    // Cache the invariants in registers
+    float x = M1.m[0][0];
+    float y = M1.m[0][1];
+    float z = M1.m[0][2];
+    float w = M1.m[0][3];
+    // Perform the operation on the first row
+    mResult.m[0][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w);
+    mResult.m[0][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w);
+    mResult.m[0][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w);
+    mResult.m[0][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w);
+    // Repeat for all the other rows
+    x = M1.m[1][0];
+    y = M1.m[1][1];
+    z = M1.m[1][2];
+    w = M1.m[1][3];
+    mResult.m[1][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w);
+    mResult.m[1][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) +
(M2.m[3][1] * w); + mResult.m[1][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[1][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[2][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[2][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[2][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[3][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[3][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[3][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[1] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32(vZ, vW); + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M1.r[0]); + t0 = _mm256_insertf128_ps(t0, M1.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M1.r[2]); + t1 = _mm256_insertf128_ps(t1, M1.r[3], 1); + + __m256 u0 = _mm256_castps128_ps256(M2.r[0]); + u0 = _mm256_insertf128_ps(u0, M2.r[1], 1); + __m256 u1 = _mm256_castps128_ps256(M2.r[2]); + u1 = _mm256_insertf128_ps(u1, M2.r[3], 1); + + __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); + __m256 c0 = _mm256_mul_ps(a0, b0); + __m256 c1 = _mm256_mul_ps(a1, b0); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); + b0 = _mm256_permute2f128_ps(u0, u0, 0x11); + __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); + __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); + __m256 c4 = _mm256_mul_ps(a0, b1); + __m256 c5 = 
_mm256_mul_ps(a1, b1);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
+    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
+    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
+    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
+
+    t0 = _mm256_add_ps(c2, c6);
+    t1 = _mm256_add_ps(c3, c7);
+
+    XMMATRIX mResult;
+    mResult.r[0] = _mm256_castps256_ps128(t0);
+    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
+    mResult.r[2] = _mm256_castps256_ps128(t1);
+    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
+    return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMMATRIX mResult;
+    // Splat the component X,Y,Z then W
+#if defined(_XM_AVX_INTRINSICS_)
+    XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
+    XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
+    XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
+    XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
+#else
+    // Use vW to hold the original row
+    XMVECTOR vW = M1.r[0];
+    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    // Perform the operation on the first row
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    // Perform a binary add to reduce cumulative errors
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    mResult.r[0] = vX;
+    // Repeat for the other 3 rows
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
+#else
+    vW = M1.r[1];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    mResult.r[1] = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
+#else
+    vW = M1.r[2];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    mResult.r[2] = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
+#else
+    vW = M1.r[3];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3,
3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[0][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[0][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[0][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[1][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[1][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[1][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[2][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[2][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[2][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[3][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[3][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[3][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = 
vmulq_lane_f32(M2.r[0], VL, 0);
+    vY = vmulq_lane_f32(M2.r[1], VL, 1);
+    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
+    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
+    float32x4_t r3 = vaddq_f32(vZ, vW);
+
+    // Transpose result
+    float32x4x2_t P0 = vzipq_f32(r0, r2);
+    float32x4x2_t P1 = vzipq_f32(r1, r3);
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
+
+    XMMATRIX mResult;
+    mResult.r[0] = T0.val[0];
+    mResult.r[1] = T0.val[1];
+    mResult.r[2] = T1.val[0];
+    mResult.r[3] = T1.val[1];
+    return mResult;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    __m256 t0 = _mm256_castps128_ps256(M1.r[0]);
+    t0 = _mm256_insertf128_ps(t0, M1.r[1], 1);
+    __m256 t1 = _mm256_castps128_ps256(M1.r[2]);
+    t1 = _mm256_insertf128_ps(t1, M1.r[3], 1);
+
+    __m256 u0 = _mm256_castps128_ps256(M2.r[0]);
+    u0 = _mm256_insertf128_ps(u0, M2.r[1], 1);
+    __m256 u1 = _mm256_castps128_ps256(M2.r[2]);
+    u1 = _mm256_insertf128_ps(u1, M2.r[3], 1);
+
+    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
+    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
+    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
+    __m256 c0 = _mm256_mul_ps(a0, b0);
+    __m256 c1 = _mm256_mul_ps(a1, b0);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
+    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
+    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
+    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
+    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
+    __m256 c4 = _mm256_mul_ps(a0, b1);
+    __m256 c5 = _mm256_mul_ps(a1, b1);
+
+    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
+    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
+    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
+    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
+    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
+
+    t0 = _mm256_add_ps(c2, c6);
+    t1 = _mm256_add_ps(c3, c7);
+
+    // Transpose result
+    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
+    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
+    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
+    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
+    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
+    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
+    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
+    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
+
+    XMMATRIX mResult;
+    mResult.r[0] = _mm256_castps256_ps128(t0);
+    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
+    mResult.r[2] = _mm256_castps256_ps128(t1);
+    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
+    return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the component X,Y,Z then W
+#if defined(_XM_AVX_INTRINSICS_)
+    XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
+    XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
+    XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
+    XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
+#else
+    // Use vW to hold the original row
+    XMVECTOR vW = M1.r[0];
+    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    // Perform the operation on the first row
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
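+    // Summing (vX + vZ) and (vY + vW) first, then combining, both shortens the
+    // dependency chain and accumulates rounding error more evenly than a
+    // straight left-to-right sum of the four products.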
+    // Perform a binary add to reduce cumulative errors
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r0 = vX;
+    // Repeat for the other 3 rows
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
+#else
+    vW = M1.r[1];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r1 = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
+#else
+    vW = M1.r[2];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r2 = vX;
+#if defined(_XM_AVX_INTRINSICS_)
+    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
+    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
+    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
+    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
+#else
+    vW = M1.r[3];
+    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
+    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
+    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
+    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
+#endif
+    vX = _mm_mul_ps(vX, M2.r[0]);
+    vY = _mm_mul_ps(vY, M2.r[1]);
+    vZ = _mm_mul_ps(vZ, M2.r[2]);
+    vW = _mm_mul_ps(vW, M2.r[3]);
+    vX = _mm_add_ps(vX, vZ);
+    vY = _mm_add_ps(vY, vW);
+    vX = _mm_add_ps(vX, vY);
+    XMVECTOR r3 = vX;
+
+    // Transpose result
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2));
+
+    XMMATRIX mResult;
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    // Original matrix:
+    //
+    //     m00m01m02m03
+    //     m10m11m12m13
+    //     m20m21m22m23
+    //     m30m31m32m33
+
+    XMMATRIX P;
+    P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21
+    P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31
+    P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23
+    P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33
+
+    XMMATRIX MT;
+    MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30
+    MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31
+    MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32
+    MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33
+    return MT;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
+    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
+
+    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
+    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
+
+    XMMATRIX mResult;
+    mResult.r[0] = T0.val[0];
+    mResult.r[1] = T0.val[1];
+    mResult.r[2] = T1.val[0];
+    mResult.r[3] = T1.val[1];
+    return mResult;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    __m256 t0 = _mm256_castps128_ps256(M.r[0]);
+    t0 = _mm256_insertf128_ps(t0, M.r[1], 1);
+    __m256 t1 = _mm256_castps128_ps256(M.r[2]);
+    t1 = _mm256_insertf128_ps(t1, M.r[3], 1);
+
+    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
+    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
+    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
+    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
+    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
+    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
+    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
+    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
+
+    XMMATRIX mResult;
+    mResult.r[0] = _mm256_castps256_ps128(t0);
+    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
+    mResult.r[2] = _mm256_castps256_ps128(t1);
+    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
+    return mResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+    XMMATRIX mResult;
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return the inverse and the determinant of a 4x4 matrix
+_Use_decl_annotations_
+inline XMMATRIX XM_CALLCONV XMMatrixInverse
+(
+    XMVECTOR* pDeterminant,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMMATRIX MT = XMMatrixTranspose(M);
+
+    XMVECTOR V0[4], V1[4];
+    V0[0] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[2]);
+    V1[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[3]);
+    V0[1] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[0]);
+    V1[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[1]);
+    V0[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z>(MT.r[2], MT.r[0]);
+    V1[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W>(MT.r[3], MT.r[1]);
+
+    XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]);
+    XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]);
+    XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]);
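+    // Each D* lane holds one 2x2 product of the transposed matrix; the matching
+    // products subtracted next (via XMVectorNegativeMultiplySubtract) turn them
+    // into the 2x2 sub-determinants used in the cofactor expansion.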
+
+    V0[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[2]);
+    V1[0] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[3]);
+    V0[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[0]);
+    V1[1] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[1]);
+    V0[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W>(MT.r[2], MT.r[0]);
+    V1[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z>(MT.r[3], MT.r[1]);
+
+    D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0);
+    D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1);
+    D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2);
+
+    V0[0] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(MT.r[1]);
+    V1[0] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X>(D0, D2);
+    V0[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(MT.r[0]);
+    V1[1] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0Z>(D0, D2);
+    V0[2] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(MT.r[3]);
+    V1[2] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X>(D1, D2);
+    V0[3] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(MT.r[2]);
+    V1[3] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0Z>(D1, D2);
+
+    XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]);
+    XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]);
+    XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]);
+    XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]);
+
+    V0[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(MT.r[1]);
+    V1[0] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(D0, D2);
+    V0[1] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[0]);
+    V1[1] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0X>(D0, D2);
+    V0[2] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(MT.r[3]);
+    V1[2] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1Z>(D1, D2);
+    V0[3] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[2]);
+    V1[3] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(D1, D2);
+
+    C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
+    C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
+    C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
+    C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
+
+    V0[0] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[1]);
+    V1[0] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X, XM_PERMUTE_0Z>(D0, D2);
+    V0[1] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[0]);
+    V1[1] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1X>(D0, D2);
+    V0[2] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[3]);
+    V1[2] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z, XM_PERMUTE_0Z>(D1, D2);
+    V0[3] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[2]);
+    V1[3] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z>(D1, D2);
+
+    XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
+    C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0);
+    XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2);
+    C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
+    XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
+    C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4);
+    XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6);
+    C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
+
+    XMMATRIX R;
+    R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v);
+    R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v);
+    R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v);
+    R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v);
+
+    XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]);
+
+    if (pDeterminant != nullptr)
+        *pDeterminant = Determinant;
+
+    XMVECTOR Reciprocal = XMVectorReciprocal(Determinant);
+
+    XMMATRIX Result;
+    Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal);
+    Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal);
+    Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal);
+    Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal);
+    return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Transpose matrix
+    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
+    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
+    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
+    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+    XMMATRIX MT;
+    MT.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
+    MT.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
+    MT.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
+    MT.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
+
+    XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 1, 0, 0));
+    XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(3, 2, 3, 2));
+    XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 1, 0, 0));
+    XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(3, 2, 3, 2));
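+    // The permutes set up pairs of factors whose products, collected in
+    // D0/D1/D2 below, are the 2x2 sub-determinants of MT; the later shuffles
+    // recombine them with rows of MT to build the cofactors for the adjugate.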
+ XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(2, 0, 2, 0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR D0 = _mm_mul_ps(V00, V10); + XMVECTOR D1 = _mm_mul_ps(V01, V11); + XMVECTOR D2 = _mm_mul_ps(V02, V12); + + V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(3, 2, 3, 2)); + V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 1, 0, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(3, 2, 3, 2)); + V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 1, 0, 0)); + V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(3, 1, 3, 1)); + V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(2, 0, 2, 0)); + + D0 = XM_FNMADD_PS(V00, V10, D0); + D1 = XM_FNMADD_PS(V01, V11, D1); + D2 = XM_FNMADD_PS(V02, V12, D2); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 0, 2, 1)); + V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0, 1, 0, 2)); + V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 0, 2, 1)); + V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(0, 1, 0, 2)); + V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1)); + + XMVECTOR C0 = _mm_mul_ps(V00, V10); + XMVECTOR C2 = _mm_mul_ps(V01, V11); + XMVECTOR C4 = _mm_mul_ps(V02, V12); + XMVECTOR C6 = _mm_mul_ps(V03, V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2, 1, 3, 2)); + V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 3, 2, 3)); + V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2, 1, 3, 2)); + V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 3, 2, 3)); + V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2)); + + C0 = XM_FNMADD_PS(V00, V10, C0); + C2 = XM_FNMADD_PS(V01, V11, C2); + C4 = XM_FNMADD_PS(V02, V12, C4); + C6 = XM_FNMADD_PS(V03, V13, C6); + + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(0, 3, 0, 3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2)); + V10 = XM_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(2, 0, 3, 1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0)); + V11 = XM_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(0, 3, 0, 3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2)); + V12 = XM_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(2, 0, 3, 1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0)); + V13 = XM_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3)); + + V00 = _mm_mul_ps(V00, V10); + V01 = _mm_mul_ps(V01, V11); + V02 = _mm_mul_ps(V02, V12); + V03 = _mm_mul_ps(V03, V13); + XMVECTOR C1 = _mm_sub_ps(C0, V00); + C0 = _mm_add_ps(C0, V00); + XMVECTOR C3 = _mm_add_ps(C2, V01); + C2 = _mm_sub_ps(C2, V01); + XMVECTOR C5 = _mm_sub_ps(C4, V02); + C4 = _mm_add_ps(C4, V02); + XMVECTOR C7 = _mm_add_ps(C6, V03); + C6 = _mm_sub_ps(C6, V03); + + C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = _mm_shuffle_ps(C2, 
C3, _MM_SHUFFLE(3, 1, 2, 0));
+    C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0));
+    C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0));
+    C0 = XM_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0));
+    C2 = XM_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0));
+    C4 = XM_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0));
+    C6 = XM_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0));
+    // Get the determinant
+    XMVECTOR vTemp = XMVector4Dot(C0, MT.r[0]);
+    if (pDeterminant != nullptr)
+        *pDeterminant = vTemp;
+    vTemp = _mm_div_ps(g_XMOne, vTemp);
+    XMMATRIX mResult;
+    mResult.r[0] = _mm_mul_ps(C0, vTemp);
+    mResult.r[1] = _mm_mul_ps(C2, vTemp);
+    mResult.r[2] = _mm_mul_ps(C4, vTemp);
+    mResult.r[3] = _mm_mul_ps(C6, vTemp);
+    return mResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    XMMATRIX mResult;
+    mResult.r[0] = XMVectorMultiply(XMVectorSwizzle<0, 0, 0, 0>(V1), V2);
+    mResult.r[1] = XMVectorMultiply(XMVectorSwizzle<1, 1, 1, 1>(V1), V2);
+    mResult.r[2] = XMVectorMultiply(XMVectorSwizzle<2, 2, 2, 2>(V1), V2);
+    mResult.r[3] = XMVectorMultiply(XMVectorSwizzle<3, 3, 3, 3>(V1), V2);
+    return mResult;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept
+{
+    static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, 1.0f, -1.0f } } };
+
+    XMVECTOR V0 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[2]);
+    XMVECTOR V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[3]);
+    XMVECTOR V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[2]);
+    XMVECTOR V3 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[3]);
+    XMVECTOR V4 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[2]);
+    XMVECTOR V5 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[3]);
+
+    XMVECTOR P0 = XMVectorMultiply(V0, V1);
+    XMVECTOR P1 = XMVectorMultiply(V2, V3);
+    XMVECTOR P2 = XMVectorMultiply(V4, V5);
+
+    V0 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[2]);
+    V1 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[3]);
+    V2 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[2]);
+    V3 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[3]);
+    V4 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[2]);
+    V5 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[3]);
+
+    P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0);
+    P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1);
+    P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2);
+
+    V0 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[1]);
+    V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[1]);
+    V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[1]);
+
+    XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v);
+    XMVECTOR R = XMVectorMultiply(V0, P0);
+    R = XMVectorNegativeMultiplySubtract(V1, P1, R);
+    R = XMVectorMultiplyAdd(V2, P2, R);
+
+    return XMVector4Dot(S, R);
+}
+
+#define XM3RANKDECOMPOSE(a, b, c, x, y, z)  \
+    if((x) < (y))                           \
+    {                                       \
+        if((y) < (z))                       \
+        {                                   \
+            (a) = 2;                        \
+            (b) = 1;                        \
+            (c) = 0;                        \
+        }                                   \
+        else                                \
+        {                                   \
+            (a) = 1;                        \
+                                            \
+            if((x) < (z))                   \
+            {                               \
+                (b) = 2;                    \
+                (c) = 0;                    \
+            }                               \
+            else                            \
+            {                               \
+                (b) = 0;                    \
+                (c) = 2;                    \
+            }                               \
+        }                                   \
+    }                                       \
+    else                                    \
+    {                                       \
+        if((x) < (z))                       \
+        {                                   \
+            (a) = 2;                        \
+            (b) = 0;                        \
+            (c) = 1;                        \
+        }                                   \
+        else                                \
+        {                                   \
+            (a) = 0;                        \
+                                            \
+            if((y) < (z))                   \
+            {                               \
+                (b) = 2;                    \
+                (c) = 1;                    \
+            }                               \
+            else                            \
+            {                               \
+                (b) = 1;                    \
+                (c) = 2;                    \
+            }                               \
+        }                                   \
+    }
+
+#define XM3_DECOMP_EPSILON 0.0001f
+
+_Use_decl_annotations_
+inline bool XM_CALLCONV XMMatrixDecompose
+(
+    XMVECTOR* outScale,
+    XMVECTOR* outRotQuat,
+    XMVECTOR* outTrans,
+    FXMMATRIX M
+) noexcept
+{
+    static const XMVECTOR* pvCanonicalBasis[3] = {
+        &g_XMIdentityR0.v,
+        &g_XMIdentityR1.v,
+        &g_XMIdentityR2.v
+    };
+
+    assert(outScale != nullptr);
+    assert(outRotQuat != nullptr);
+    assert(outTrans != nullptr);
+
+    // Get the translation
+    outTrans[0] = M.r[3];
+
+    XMVECTOR* ppvBasis[3];
+    XMMATRIX matTemp;
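+    // ppvBasis aliases the three rows of matTemp so the rank-sorted basis
+    // vectors can be renormalized (and, for degenerate scales, rebuilt via
+    // cross products) in place before the rotation quaternion is extracted.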
+    ppvBasis[0] = &matTemp.r[0];
+    ppvBasis[1] = &matTemp.r[1];
+    ppvBasis[2] = &matTemp.r[2];
+
+    matTemp.r[0] = M.r[0];
+    matTemp.r[1] = M.r[1];
+    matTemp.r[2] = M.r[2];
+    matTemp.r[3] = g_XMIdentityR3.v;
+
+    auto pfScales = reinterpret_cast<float*>(outScale);
+
+    size_t a, b, c;
+    XMVectorGetXPtr(&pfScales[0], XMVector3Length(ppvBasis[0][0]));
+    XMVectorGetXPtr(&pfScales[1], XMVector3Length(ppvBasis[1][0]));
+    XMVectorGetXPtr(&pfScales[2], XMVector3Length(ppvBasis[2][0]));
+    pfScales[3] = 0.f;
+
+    XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2])
+
+    if (pfScales[a] < XM3_DECOMP_EPSILON)
+    {
+        ppvBasis[a][0] = pvCanonicalBasis[a][0];
+    }
+    ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]);
+
+    if (pfScales[b] < XM3_DECOMP_EPSILON)
+    {
+        size_t aa, bb, cc;
+        float fAbsX, fAbsY, fAbsZ;
+
+        fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0]));
+        fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0]));
+        fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0]));
+
+        XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ)
+
+        ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0], pvCanonicalBasis[cc][0]);
+    }
+
+    ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]);
+
+    if (pfScales[c] < XM3_DECOMP_EPSILON)
+    {
+        ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0], ppvBasis[b][0]);
+    }
+
+    ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]);
+
+    float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp));
+
+    // use Cramer's rule to check for handedness of coordinate system
+    if (fDet < 0.0f)
+    {
+        // switch coordinate system by negating the scale and inverting the basis vector on the x-axis
+        pfScales[a] = -pfScales[a];
+        ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]);
+
+        fDet = -fDet;
+    }
+
+    fDet -= 1.0f;
+    fDet *= fDet;
+
+    if (XM3_DECOMP_EPSILON < fDet)
+    {
+        // Non-SRT matrix encountered
+        return false;
+    }
+
+    // generate the quaternion from the matrix
+    outRotQuat[0] = XMQuaternionRotationMatrix(matTemp);
+    return true;
+}
+
+#undef XM3_DECOMP_EPSILON
+#undef XM3RANKDECOMPOSE
+
+//------------------------------------------------------------------------------
+// Transformation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept
+{
+    XMMATRIX M;
+    M.r[0] = g_XMIdentityR0.v;
+    M.r[1] = g_XMIdentityR1.v;
+    M.r[2] = g_XMIdentityR2.v;
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixSet
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+) noexcept
+{
+    XMMATRIX M;
+#if defined(_XM_NO_INTRINSICS_)
+    M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03;
+    M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13;
+    M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23;
+    M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33;
+#else
+    M.r[0] = XMVectorSet(m00, m01, m02, m03);
+    M.r[1] = XMVectorSet(m10, m11, m12, m13);
+    M.r[2] = XMVectorSet(m20, m21, m22, m23);
+    M.r[3] = XMVectorSet(m30, m31, m32, m33);
+#endif
+    return M;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixTranslation
+(
+    float OffsetX,
+    float OffsetY,
+    float OffsetZ
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
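+    // DirectXMath matrices are row-major and transform row vectors, so the
+    // translation components live in the fourth row (m[3][0..2]) rather than
+    // the fourth column.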
XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect(g_XMIdentityR3.v, Offset, g_XMSelect1110.v); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ScaleX, Zero, 0); + M.r[1] = vsetq_lane_f32(ScaleY, Zero, 1); + M.r[2] = vsetq_lane_f32(ScaleZ, Zero, 2); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps(0, 0, 0, ScaleX); + M.r[1] = _mm_set_ps(0, 0, ScaleY, 0); + M.r[2] = _mm_set_ps(0, ScaleZ, 0, 0); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ)); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif 
defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale, g_XMMaskX); + M.r[1] = _mm_and_ps(Scale, g_XMMaskY); + M.r[2] = _mm_and_ps(Scale, g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1); + T1 = vsetq_lane_f32(fSinAngle, T1, 2); + + float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(-fSinAngle, T0, 2); + + float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2)); + // x = cos,y = 0,z 
= -sin, w = 0
+    vSin = _mm_mul_ps(vSin, g_XMNegateZ);
+    M.r[0] = vSin;
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMMATRIX M;
+    M.m[0][0] = fCosAngle;
+    M.m[0][1] = fSinAngle;
+    M.m[0][2] = 0.0f;
+    M.m[0][3] = 0.0f;
+
+    M.m[1][0] = -fSinAngle;
+    M.m[1][1] = fCosAngle;
+    M.m[1][2] = 0.0f;
+    M.m[1][3] = 0.0f;
+
+    M.m[2][0] = 0.0f;
+    M.m[2][1] = 0.0f;
+    M.m[2][2] = 1.0f;
+    M.m[2][3] = 0.0f;
+
+    M.m[3][0] = 0.0f;
+    M.m[3][1] = 0.0f;
+    M.m[3][2] = 0.0f;
+    M.m[3][3] = 1.0f;
+    return M;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    const float32x4_t Zero = vdupq_n_f32(0);
+
+    float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
+    T0 = vsetq_lane_f32(fSinAngle, T0, 1);
+
+    float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
+    T1 = vsetq_lane_f32(fCosAngle, T1, 1);
+
+    XMMATRIX M;
+    M.r[0] = T0;
+    M.r[1] = T1;
+    M.r[2] = g_XMIdentityR2.v;
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+#elif defined(_XM_SSE_INTRINSICS_)
+    float SinAngle;
+    float CosAngle;
+    XMScalarSinCos(&SinAngle, &CosAngle, Angle);
+
+    XMVECTOR vSin = _mm_set_ss(SinAngle);
+    XMVECTOR vCos = _mm_set_ss(CosAngle);
+    // x = cos,y = sin,z = 0, w = 0
+    vCos = _mm_unpacklo_ps(vCos, vSin);
+    XMMATRIX M;
+    M.r[0] = vCos;
+    // x = sin,y = cos,z = 0, w = 0
+    vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1));
+    // x = -sin,y = cos,z = 0, w = 0
+    vCos = _mm_mul_ps(vCos, g_XMNegateX);
+    M.r[1] = vCos;
+    M.r[2] = g_XMIdentityR2;
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw
+(
+    float Pitch,
+    float Yaw,
+    float Roll
+) noexcept
+{
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    return XMMatrixRotationRollPitchYawFromVector(Angles);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
+) noexcept
+{
+    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return XMMatrixRotationQuaternion(Q);
+}
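+
+// Example: XMMatrixRotationRollPitchYaw takes radians in the order pitch
+// (about x), yaw (about y), roll (about z), and applies the rotations roll
+// first, then pitch, then yaw. A minimal sketch of building an orientation
+// this way:
+//
+//     XMMATRIX orientation = XMMatrixRotationRollPitchYaw(
+//         XM_PIDIV4,   // pitch: 45 degrees about +X
+//         XM_PI,       // yaw: 180 degrees about +Y
+//         0.0f);       // roll: none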
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);
+
+    XMVECTOR C2 = XMVectorSplatZ(A);
+    XMVECTOR C1 = XMVectorSplatY(A);
+    XMVECTOR C0 = XMVectorSplatX(A);
+
+    XMVECTOR N0 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(NormalAxis);
+    XMVECTOR N1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(NormalAxis);
+
+    XMVECTOR V0 = XMVectorMultiply(C2, N0);
+    V0 = XMVectorMultiply(V0, N1);
+
+    XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis);
+    R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1);
+
+    XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0);
+    XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0);
+
+    V0 = XMVectorSelect(A, R0, g_XMSelect1110.v);
+    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(R1, R2);
+    XMVECTOR V2 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(R1, R2);
+
+    XMMATRIX M;
+    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(V0, V1);
+    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(V0, V1);
+    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(V0, V2);
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle);
+    XMVECTOR C1 = _mm_set_ps1(fCosAngle);
+    XMVECTOR C0 = _mm_set_ps1(fSinAngle);
+
+    XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1));
+    XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2));
+
+    XMVECTOR V0 = _mm_mul_ps(C2, N0);
+    V0 = _mm_mul_ps(V0, N1);
+
+    XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis);
+    R0 = _mm_mul_ps(R0, NormalAxis);
+    R0 = _mm_add_ps(R0, C1);
+
+    XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis);
+    R1 = _mm_add_ps(R1, V0);
+    XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis);
+    R2 = _mm_sub_ps(V0, R2);
+
+    V0 = _mm_and_ps(R0, g_XMMask3);
+    XMVECTOR V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 1, 2, 0));
+    V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 3, 2, 1));
+    XMVECTOR V2 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(0, 0, 1, 1));
+    V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 2, 0));
+
+    R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(1, 0, 3, 0));
+    R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 2, 0));
+
+    XMMATRIX M;
+    M.r[0] = R2;
+
+    R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(3, 2, 3, 1));
+    R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 0, 2));
+    M.r[1] = R2;
+
+    V2 = _mm_shuffle_ps(V2, V0, _MM_SHUFFLE(3, 2, 1, 0));
+    M.r[2] = V2;
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+) noexcept
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    return XMMatrixRotationNormal(Normal, Angle);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } };
+
+    XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion);
+    XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0);
+
+    XMVECTOR V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W>(Q1, Constant1110.v);
+    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W>(Q1, Constant1110.v);
+    XMVECTOR R0 = XMVectorSubtract(Constant1110, V0);
+    R0 = XMVectorSubtract(R0, V1);
+
+    V0 = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Quaternion);
+    V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Q0);
+    V0 = XMVectorMultiply(V0, V1);
+
+    V1 = XMVectorSplatW(Quaternion);
+    XMVECTOR V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(Q0);
+    V1 = XMVectorMultiply(V1, V2);
+
+    XMVECTOR R1 = XMVectorAdd(V0, V1);
+    XMVECTOR R2 = XMVectorSubtract(V0, V1);
+
+    V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z>(R1, R2);
+    V1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z>(R1, R2);
+
+    XMMATRIX M;
+    M.r[0] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1W>(R0, V0);
+    M.r[1] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_0W, XM_PERMUTE_1W>(R0, V0);
+    M.r[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_1W>(R0, V1);
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } };
+
+    XMVECTOR Q0 = _mm_add_ps(Quaternion, Quaternion);
+    XMVECTOR Q1 = _mm_mul_ps(Quaternion, Q0);
+
+    XMVECTOR V0 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 0, 0, 1));
+    V0 = _mm_and_ps(V0, g_XMMask3);
+    XMVECTOR V1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 1, 2, 2));
+    V1 = _mm_and_ps(V1, g_XMMask3);
+    XMVECTOR R0 = _mm_sub_ps(Constant1110, V0);
+    R0 = _mm_sub_ps(R0, V1);
+
+    V0 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 1, 0, 
0)); + V1 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 2, 1, 2)); + V0 = _mm_mul_ps(V0, V1); + + V1 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 3, 3, 3)); + XMVECTOR V2 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 0, 2, 1)); + V1 = _mm_mul_ps(V1, V2); + + XMVECTOR R1 = _mm_add_ps(V0, V1); + XMVECTOR R2 = _mm_sub_ps(V0, V1); + + V0 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(1, 0, 2, 1)); + V0 = XM_PERMUTE_PS(V0, _MM_SHUFFLE(1, 3, 2, 0)); + V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 2, 0, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 0, 2, 0)); + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(1, 0, 3, 0)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = Q1; + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(3, 2, 3, 1)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = Q1; + + Q1 = _mm_shuffle_ps(V1, R0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = Q1; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D +( + FXMVECTOR ScalingOrigin, + float ScalingOrientation, + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + GXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = 
XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = { { { -2.0f, -2.0f, -2.0f, 0.0f } } }; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + + XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV 
XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) noexcept +{ + static const XMVECTORU32 Select0001 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1 } } }; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + 
float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float 
TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + 
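+// Example: the field-of-view form is the usual way to build a game camera
+// projection. FovAngleY is the full vertical field of view in radians and
+// AspectRatio is width over height; a sketch for a 60-degree, 16:9 camera
+// with z in [0.1, 1000] mapped into [0, 1]:
+//
+//     XMMATRIX proj = XMMatrixPerspectiveFovLH(
+//         XMConvertToRadians(60.0f), 16.0f / 9.0f, 0.1f, 1000.0f);
+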
+//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + 
ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, 
+ fRange, + -1.0f); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ - NearZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 
0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ - FarZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return 
M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, 
g_XMMaskY);
+    vTemp = _mm_add_ps(vTemp, vTemp);
+    M.r[1] = vTemp;
+    // 0,0,fRange,0.0f
+    vTemp = vValues;
+    vTemp = _mm_and_ps(vTemp, g_XMMaskZ);
+    M.r[2] = vTemp;
+    // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*NearZ,1.0f
+    vValues = _mm_mul_ps(vValues, rMem2);
+    M.r[3] = vValues;
+    return M;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+/****************************************************************************
+ *
+ * XMMATRIX operators and methods
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMMATRIX::XMMATRIX
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+) noexcept
+{
+    r[0] = XMVectorSet(m00, m01, m02, m03);
+    r[1] = XMVectorSet(m10, m11, m12, m13);
+    r[2] = XMVectorSet(m20, m21, m22, m23);
+    r[3] = XMVectorSet(m30, m31, m32, m33);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX::XMMATRIX(const float* pArray) noexcept
+{
+    assert(pArray != nullptr);
+    r[0] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray));
+    r[1] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 4));
+    r[2] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 8));
+    r[3] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 12));
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XMMATRIX::operator- () const noexcept
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorNegate(r[0]);
+    R.r[1] = XMVectorNegate(r[1]);
+    R.r[2] = XMVectorNegate(r[2]);
+    R.r[3] = XMVectorNegate(r[3]);
+    return R;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorAdd(r[0], M.r[0]);
+    r[1] = XMVectorAdd(r[1], M.r[1]);
+    r[2] = XMVectorAdd(r[2], M.r[2]);
+    r[3] = XMVectorAdd(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorSubtract(r[0], M.r[0]);
+    r[1] = XMVectorSubtract(r[1], M.r[1]);
+    r[2] = XMVectorSubtract(r[2], M.r[2]);
+    r[3] = XMVectorSubtract(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) noexcept
+{
+    *this = XMMatrixMultiply(*this, M);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator*= (float S) noexcept
+{
+    r[0] = XMVectorScale(r[0], S);
+    r[1] = XMVectorScale(r[1], S);
+    r[2] = XMVectorScale(r[2], S);
+    r[3] = XMVectorScale(r[3], S);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vS = XMVectorReplicate(S);
+    r[0] = XMVectorDivide(r[0], vS);
+    r[1] = XMVectorDivide(r[1], vS);
+    r[2] = XMVectorDivide(r[2], vS);
+    r[3] = XMVectorDivide(r[3], vS);
+    return *this;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
+    float32x4_t vS = vdupq_n_f32(S);
+    r[0] = vdivq_f32(r[0], vS);
+    r[1] 
= vdivq_f32(r[1], vS); + r[2] = vdivq_f32(r[2], vS); + r[3] = vdivq_f32(r[3], vS); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32(S); + float32x2_t R0 = vrecpe_f32(vS); + float32x2_t S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + float32x4_t Reciprocal = vcombine_f32(R0, R0); + r[0] = vmulq_f32(r[0], Reciprocal); + r[1] = vmulq_f32(r[1], Reciprocal); + r[2] = vmulq_f32(r[2], Reciprocal); + r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return *this; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + r[0] = _mm_div_ps(r[0], vS); + r[1] = _mm_div_ps(r[1], vS); + r[2] = _mm_div_ps(r[2], vS); + r[3] = _mm_div_ps(r[3], vS); + return *this; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorAdd(r[0], M.r[0]); + R.r[1] = XMVectorAdd(r[1], M.r[1]); + R.r[2] = XMVectorAdd(r[2], M.r[2]); + R.r[3] = XMVectorAdd(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorSubtract(r[0], M.r[0]); + R.r[1] = XMVectorSubtract(r[1], M.r[1]); + R.r[2] = XMVectorSubtract(r[2], M.r[2]); + R.r[3] = XMVectorSubtract(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const noexcept +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(r[0], S); + R.r[1] = XMVectorScale(r[1], S); + R.r[2] = XMVectorScale(r[2], S); + R.r[3] = XMVectorScale(r[3], S); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate(S); + XMMATRIX R; + R.r[0] = XMVectorDivide(r[0], vS); + R.r[1] = XMVectorDivide(r[1], vS); + R.r[2] = XMVectorDivide(r[2], vS); + R.r[3] = XMVectorDivide(r[3], vS); + return R; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + float32x4_t vS = vdupq_n_f32(S); + XMMATRIX R; + R.r[0] = vdivq_f32(r[0], vS); + R.r[1] = vdivq_f32(r[1], vS); + R.r[2] = vdivq_f32(r[2], vS); + R.r[3] = vdivq_f32(r[3], vS); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32(S); + float32x2_t R0 = vrecpe_f32(vS); + float32x2_t S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + float32x4_t Reciprocal = vcombine_f32(R0, R0); + XMMATRIX R; + R.r[0] = vmulq_f32(r[0], Reciprocal); + R.r[1] = vmulq_f32(r[1], Reciprocal); + R.r[2] = vmulq_f32(r[2], Reciprocal); + R.r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return R; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + XMMATRIX R; + R.r[0] = _mm_div_ps(r[0], vS); + R.r[1] = _mm_div_ps(r[1], vS); + R.r[2] = _mm_div_ps(r[2], vS); + R.r[3] = _mm_div_ps(r[3], vS); + return R; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV 
operator* +( + float S, + FXMMATRIX M +) noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(M.r[0], S); + R.r[1] = XMVectorScale(M.r[1], S); + R.r[2] = XMVectorScale(M.r[2], S); + R.r[3] = XMVectorScale(M.r[3], S); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +/**************************************************************************** +* +* XMFLOAT3X4 operators +* +****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X4::XMFLOAT3X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + diff --git a/DirectXMath/DirectXMathMisc.inl b/DirectXMath/DirectXMathMisc.inl new file mode 100644 index 0000000..4cfd042 --- /dev/null +++ b/DirectXMath/DirectXMathMisc.inl @@ -0,0 +1,2452 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept +{ + return XMVector4Equal(Q, g_XMIdentityR3.v); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) + + // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), + // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), + // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), + // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), + (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), + (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), + (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 
1.0f, 1.0f, -1.0f } } }; + + float32x2_t Q2L = vget_low_f32(Q2); + float32x2_t Q2H = vget_high_f32(Q2); + + float32x4_t Q2X = vdupq_lane_f32(Q2L, 0); + float32x4_t Q2Y = vdupq_lane_f32(Q2L, 1); + float32x4_t Q2Z = vdupq_lane_f32(Q2H, 0); + XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); + + // Mul by Q1WZYX + float32x4_t vTemp = vrev64q_f32(Q1); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2X = vmulq_f32(Q2X, vTemp); + vResult = vmlaq_f32(vResult, Q2X, ControlWZYX); + + // Mul by Q1ZWXY + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + Q2Y = vmulq_f32(Q2Y, vTemp); + vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); + + // Mul by Q1YXWZ + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2Z = vmulq_f32(Q2Z, vTemp); + vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 1.0f, 1.0f, -1.0f } } }; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3)); + Q2X = XM_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0)); + Q2Y = XM_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1)); + Q2Z = XM_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult, Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1)); + // Flip the signs on y and z + vResult = XM_FMADD_PS(Q2X, ControlWZYX, vResult); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y, ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle); + // Flip the signs on x and w + Q2Y = XM_FMADD_PS(Q2Z, ControlYXWZ, Q2Y); + vResult = _mm_add_ps(vResult, Q2Y); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept +{ 
+#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return vmulq_f32(Q, NegativeOne3.v); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return _mm_mul_ps(Q, NegativeOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept +{ + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR L = XMVector4LengthSq(Q); + XMVECTOR Conjugate = XMQuaternionConjugate(Q); + + XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + XMVECTOR Result = XMVectorDivide(Conjugate, L); + + Result = XMVectorSelect(Result, Zero, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept +{ + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR QW = XMVectorSplatW(Q); + XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); + + XMVECTOR Theta = XMVectorACos(QW); + XMVECTOR SinTheta = XMVectorSin(Theta); + + XMVECTOR S = XMVectorDivide(Theta, SinTheta); + + XMVECTOR Result = XMVectorMultiply(Q0, S); + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept +{ + XMVECTOR Theta = XMVector3Length(Q); + + XMVECTOR SinTheta, CosTheta; + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + XMVECTOR S = XMVectorDivide(SinTheta, Theta); + + XMVECTOR Result = XMVectorMultiply(Q, S); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = 
XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + SinOmega = XMVectorSqrt(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR SignMask = XMVectorSplatSignMask(); + XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(g_XMIdentityR0.v, V01); + + XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); + + XMVECTOR S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + XMVECTOR Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + static const XMVECTORU32 SignMask2 = { { { 0x80000000, 0x00000000, 0x00000000, 0x00000000 } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = _mm_mul_ps(CosOmega, CosOmega); + SinOmega = _mm_sub_ps(g_XMOne, SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR V01 = XM_PERMUTE_PS(T, _MM_SHUFFLE(2, 3, 0, 1)); + V01 = _mm_and_ps(V01, g_XMMaskXY); + V01 = _mm_xor_ps(V01, SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + XMVECTOR S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + XMVECTOR Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result, S1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + HXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + XMVECTOR TP = T; + const XMVECTOR Two = XMVectorSplatConstant(2, 0); + + XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); + XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3 +) noexcept +{ + assert(pA); + assert(pB); + assert(pC); + + XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + XMVECTOR LD12 = 
XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + XMVECTOR SQ2 = XMVectorNegate(Q2); + + XMVECTOR Control1 = XMVectorLess(LS12, LD12); + SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + XMVECTOR SQ0 = XMVectorNegate(Q0); + + XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + XMVECTOR SQ3 = XMVectorNegate(Q3); + + XMVECTOR Control0 = XMVectorLess(LS01, LD01); + XMVECTOR Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + XMVECTOR InvQ1 = XMQuaternionInverse(Q1); + XMVECTOR InvQ2 = XMQuaternionInverse(SQ2); + + XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + float f, + float g +) noexcept +{ + float s = f + g; + + XMVECTOR Result; + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s); + XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + assert((XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F))); + assert((XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G))); + + const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16); + + XMVECTOR S = XMVectorAdd(F, G); + + XMVECTOR Result; + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S); + XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S); + XMVECTOR GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept +{ + return g_XMIdentityR3.v; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ + XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + XMVECTOR 
Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+) noexcept
+{
+    static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } };
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet(SinV, SinV, SinV, CosV);
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis, g_XMMask3);
+    N = _mm_or_ps(N, g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine, &vCosine, Scale);
+    Scale = _mm_and_ps(vSine, g_XMMask3);
+    vCosine = _mm_and_ps(vCosine, g_XMMaskW);
+    Scale = _mm_or_ps(Scale, vCosine);
+    N = _mm_mul_ps(N, Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+) noexcept
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 q;
+    float r22 = M.m[2][2];
+    if (r22 <= 0.f)  // x^2 + y^2 >= z^2 + w^2
+    {
+        float dif10 = M.m[1][1] - M.m[0][0];
+        float omr22 = 1.f - r22;
+        if (dif10 <= 0.f)  // x^2 >= y^2
+        {
+            float fourXSqr = omr22 - dif10;
+            float inv4x = 0.5f / sqrtf(fourXSqr);
+            q.f[0] = fourXSqr * inv4x;
+            q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x;
+            q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x;
+            q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x;
+        }
+        else  // y^2 >= x^2
+        {
+            float fourYSqr = omr22 + dif10;
+            float inv4y = 0.5f / sqrtf(fourYSqr);
+            q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y;
+            q.f[1] = fourYSqr * inv4y;
+            q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y;
+            q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y;
+        }
+    }
+    else  // z^2 + w^2 >= x^2 + y^2
+    {
+        float sum10 = M.m[1][1] + M.m[0][0];
+        float opr22 = 1.f + r22;
+        if (sum10 <= 0.f)  // z^2 >= w^2
+        {
+            float fourZSqr = opr22 - sum10;
+            float inv4z = 0.5f / sqrtf(fourZSqr);
+            q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z;
+            q.f[1] = 
(M.m[1][2] + M.m[2][1]) * inv4z; + q.f[2] = fourZSqr * inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w; + q.f[3] = fourWSqr * inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } }; + static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + float32x4_t r0 = M.r[0]; + float32x4_t r1 = M.r[1]; + float32x4_t r2 = M.r[2]; + + float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0); + float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + float32x4_t r11mr00 = vsubq_f32(r11, r00); + uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + float32x4_t r11pr00 = vaddq_f32(r11, r00); + uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + float32x4_t t0 = vmulq_f32(XMPMMP, r00); + float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11); + x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22); + x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + float32x4_t t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1))); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0); + t1 = vbslq_f32(Select0110, t1, r10); + + // (4*x*y, 4*x*z, 4*y*z, unused) + float32x4_t xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10)); + + // (r12, r02, r01, r12) + float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0))); + float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0); + t1 = vbslq_f32(Select0110, t2, t3); + + // (4*x*w, 4*y*w, 4*z*w, unused) + float32x4_t xwywzw = vsubq_f32(t0, t1); + xwywzw = vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 3); + t1 = vbslq_f32(Select0110, t0, x2y2z2w2); + t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0); + float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2); + t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1); + float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw))); + float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32(x2gey2, tensor0, tensor1); + t1 = vbslq_f32(z2gew2, tensor2, tensor3); + t2 = vbslq_f32(x2py2gez2pw2, t0, t1); + + // Normalize the row. 
No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne); + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0); + XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1)); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). 
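+    // (whichever row is selected equals 4*q_i*(x, y, z, w) for the component
+    // q_i of largest magnitude, so the divide below recovers the quaternion
+    // up to the usual q / -q sign ambiguity)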
+ t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) noexcept +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) noexcept +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + + XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMVector4Dot(P, V3); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
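+// Illustrative check (approximate, since this is the estimate path):
+//
+//     XMVECTOR p = XMVectorSet(0.f, 2.f, 0.f, 4.f);  // plane y = -2, unnormalized
+//     XMVECTOR n = XMPlaneNormalizeEst(p);           // roughly (0, 1, 0, 2)
+//
+// Only xyz contributes to the length; w (the plane's d term) is scaled by
+// the same reciprocal, so the plane itself is unchanged.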
+ +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3ReciprocalLengthEst(P); + return XMVectorMultiply(P, Result); + +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(P, P, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, P); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P, P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot, P); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLengthSq = sqrtf((P.vector4_f32[0] * P.vector4_f32[0]) + (P.vector4_f32[1] * P.vector4_f32[1]) + (P.vector4_f32[2] * P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq > 0) + { + fLengthSq = 1.0f / fLengthSq; + } + XMVECTORF32 vResult = { { { + P.vector4_f32[0] * fLengthSq, + P.vector4_f32[1] * fLengthSq, + P.vector4_f32[2] * fLengthSq, + P.vector4_f32[3] * fLengthSq + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLength = XMVector3ReciprocalLength(P); + return XMVectorMultiply(P, vLength); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(P, P, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P, P); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) noexcept +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = 
XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + assert(pLinePoint1); + assert(pLinePoint2); + + XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1, g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX M +) noexcept +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, M.r[3]); + Result = XMVectorMultiplyAdd(Z, M.r[2], Result); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX M +) noexcept +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + M); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) noexcept +{ + XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) noexcept +{ + XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + 
//------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3); + return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
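+    // (with the signs flipped, adding (1, 1, 1, 0) computes the complement
+    // 1.0f - c for each color channel while passing alpha through untouched)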
+ XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp, g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) noexcept +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + + const XMVECTORF32 gvLuminance = { { { 0.2125f, 0.7154f, 0.0721f, 0.0f } } }; +#if defined(_XM_NO_INTRINSICS_) + float fLuminance = (vColor.vector4_f32[0] * gvLuminance.f[0]) + (vColor.vector4_f32[1] * gvLuminance.f[1]) + (vColor.vector4_f32[2] * gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32(vLuminance, vResult, fSaturation); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + // Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); + // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor, vLuminance); + vResult = XM_FMADD_PS(vResult, vSaturation, vLuminance); + // Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vLuminance, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) noexcept +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32(g_XMOneHalf.v, vResult, fContrast); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor, g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = XM_FMADD_PS(vResult, vScale, g_XMOneHalf); +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vScale, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; 
+#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept +{ + XMVECTOR r = XMVectorSplatX(rgb); + XMVECTOR g = XMVectorSplatY(rgb); + XMVECTOR b = XMVectorSplatZ(rgb); + + XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b)); + XMVECTOR max = XMVectorMax(r, XMVectorMax(g, b)); + + XMVECTOR l = XMVectorMultiply(XMVectorAdd(min, max), g_XMOneHalf); + + XMVECTOR d = XMVectorSubtract(max, min); + + XMVECTOR la = XMVectorSelect(rgb, l, g_XMSelect1110); + + if (XMVector3Less(d, g_XMEpsilon)) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect(la, g_XMZero, g_XMSelect1100); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd(min, max); + + if (XMVector3Greater(l, g_XMOneHalf)) + { + // d / (2-max-min) + s = XMVectorDivide(d, XMVectorSubtract(g_XMTwo, d2)); + } + else + { + // d / (max+min) + s = XMVectorDivide(d, d2); + } + + if (XMVector3Equal(r, max)) + { + // Red is max + h = XMVectorDivide(XMVectorSubtract(g, b), d); + } + else if (XMVector3Equal(g, max)) + { + // Green is max + h = XMVectorDivide(XMVectorSubtract(b, r), d); + h = XMVectorAdd(h, g_XMTwo); + } + else + { + // Blue is max + h = XMVectorDivide(XMVectorSubtract(r, g), d); + h = XMVectorAdd(h, g_XMFour); + } + + h = XMVectorDivide(h, g_XMSix); + + if (XMVector3Less(h, g_XMZero)) + h = XMVectorAdd(h, g_XMOne); + + XMVECTOR lha = XMVectorSelect(la, h, g_XMSelect1100); + return XMVectorSelect(s, lha, g_XMSelect1011); + } +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + + inline XMVECTOR XM_CALLCONV XMColorHue2Clr(FXMVECTOR p, FXMVECTOR q, FXMVECTOR h) noexcept + { + static const XMVECTORF32 oneSixth = { { { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f } } }; + static const XMVECTORF32 twoThirds = { { { 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f } } }; + + XMVECTOR t = h; + + if (XMVector3Less(t, g_XMZero)) + t = XMVectorAdd(t, g_XMOne); + + if (XMVector3Greater(t, g_XMOne)) + t = XMVectorSubtract(t, g_XMOne); + + if (XMVector3Less(t, oneSixth)) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, t); + return XMVectorMultiplyAdd(t1, t2, p); + } + + if (XMVector3Less(t, g_XMOneHalf)) + return q; + + if (XMVector3Less(t, twoThirds)) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, XMVectorSubtract(twoThirds, t)); + return XMVectorMultiplyAdd(t1, t2, p); + } + + return p; + } + +} // namespace Internal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept +{ + static const XMVECTORF32 oneThird = { { { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f } } }; + + XMVECTOR s = XMVectorSplatY(hsl); + XMVECTOR l = XMVectorSplatZ(hsl); + + if (XMVector3NearEqual(s, g_XMZero, g_XMEpsilon)) + { + // Achromatic + return XMVectorSelect(hsl, l, g_XMSelect1110); + } + else + { + XMVECTOR h = XMVectorSplatX(hsl); + + XMVECTOR q; + if (XMVector3Less(l, g_XMOneHalf)) + { + q = XMVectorMultiply(l, XMVectorAdd(g_XMOne, s)); + } + else + { + q = XMVectorSubtract(XMVectorAdd(l, s), XMVectorMultiply(l, s)); + } + + XMVECTOR p = XMVectorSubtract(XMVectorMultiply(g_XMTwo, l), q); + + XMVECTOR r = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorAdd(h, oneThird)); + XMVECTOR g = DirectX::Internal::XMColorHue2Clr(p, q, h); + XMVECTOR b = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorSubtract(h, oneThird)); + + 
XMVECTOR rg = XMVectorSelect(g, r, g_XMSelect1000);
+        XMVECTOR ba = XMVectorSelect(hsl, b, g_XMSelect1110);
+
+        return XMVectorSelect(ba, rg, g_XMSelect1100);
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept
+{
+    XMVECTOR r = XMVectorSplatX(rgb);
+    XMVECTOR g = XMVectorSplatY(rgb);
+    XMVECTOR b = XMVectorSplatZ(rgb);
+
+    XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b));
+    XMVECTOR v = XMVectorMax(r, XMVectorMax(g, b));
+
+    XMVECTOR d = XMVectorSubtract(v, min);
+
+    XMVECTOR s = (XMVector3NearEqual(v, g_XMZero, g_XMEpsilon)) ? g_XMZero : XMVectorDivide(d, v);
+
+    if (XMVector3Less(d, g_XMEpsilon))
+    {
+        // Achromatic, assume H of 0
+        XMVECTOR hv = XMVectorSelect(v, g_XMZero, g_XMSelect1000);
+        XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110);
+        return XMVectorSelect(s, hva, g_XMSelect1011);
+    }
+    else
+    {
+        XMVECTOR h;
+
+        if (XMVector3Equal(r, v))
+        {
+            // Red is max
+            h = XMVectorDivide(XMVectorSubtract(g, b), d);
+
+            if (XMVector3Less(g, b))
+                h = XMVectorAdd(h, g_XMSix);
+        }
+        else if (XMVector3Equal(g, v))
+        {
+            // Green is max
+            h = XMVectorDivide(XMVectorSubtract(b, r), d);
+            h = XMVectorAdd(h, g_XMTwo);
+        }
+        else
+        {
+            // Blue is max
+            h = XMVectorDivide(XMVectorSubtract(r, g), d);
+            h = XMVectorAdd(h, g_XMFour);
+        }
+
+        h = XMVectorDivide(h, g_XMSix);
+
+        XMVECTOR hv = XMVectorSelect(v, h, g_XMSelect1000);
+        XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110);
+        return XMVectorSelect(s, hva, g_XMSelect1011);
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept
+{
+    XMVECTOR h = XMVectorSplatX(hsv);
+    XMVECTOR s = XMVectorSplatY(hsv);
+    XMVECTOR v = XMVectorSplatZ(hsv);
+
+    XMVECTOR h6 = XMVectorMultiply(h, g_XMSix);
+
+    XMVECTOR i = XMVectorFloor(h6);
+    XMVECTOR f = XMVectorSubtract(h6, i);
+
+    // p = v* (1-s)
+    XMVECTOR p = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, s));
+
+    // q = v*(1-f*s)
+    XMVECTOR q = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(f, s)));
+
+    // t = v*(1 - (1-f)*s)
+    XMVECTOR t = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(XMVectorSubtract(g_XMOne, f), s)));
+
+    auto ii = static_cast<size_t>(XMVectorGetX(XMVectorMod(i, g_XMSix)));
+
+    XMVECTOR _rgb;
+
+    switch (ii)
+    {
+    case 0: // rgb = vtp
+    {
+        XMVECTOR vt = XMVectorSelect(t, v, g_XMSelect1000);
+        _rgb = XMVectorSelect(p, vt, g_XMSelect1100);
+    }
+    break;
+    case 1: // rgb = qvp
+    {
+        XMVECTOR qv = XMVectorSelect(v, q, g_XMSelect1000);
+        _rgb = XMVectorSelect(p, qv, g_XMSelect1100);
+    }
+    break;
+    case 2: // rgb = pvt
+    {
+        XMVECTOR pv = XMVectorSelect(v, p, g_XMSelect1000);
+        _rgb = XMVectorSelect(t, pv, g_XMSelect1100);
+    }
+    break;
+    case 3: // rgb = pqv
+    {
+        XMVECTOR pq = XMVectorSelect(q, p, g_XMSelect1000);
+        _rgb = XMVectorSelect(v, pq, g_XMSelect1100);
+    }
+    break;
+    case 4: // rgb = tpv
+    {
+        XMVECTOR tp = XMVectorSelect(p, t, g_XMSelect1000);
+        _rgb = XMVectorSelect(v, tp, g_XMSelect1100);
+    }
+    break;
+    default: // rgb = vpq
+    {
+        XMVECTOR vp = XMVectorSelect(p, v, g_XMSelect1000);
+        _rgb = XMVectorSelect(q, vp, g_XMSelect1100);
+    }
+    break;
+    }
+
+    return XMVectorSelect(hsv, _rgb, g_XMSelect1110);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept
+{
+    static const XMVECTORF32 Scale0 = { { { 0.299f, -0.147f, 0.615f, 0.0f } } };
+    
static const XMVECTORF32 Scale1 = { { { 0.587f, -0.289f, -0.515f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.114f, 0.436f, -0.100f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.395f, 2.032f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.140f, -0.581f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2126f, -0.0997f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.7152f, -0.3354f, -0.5586f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0722f, 0.4351f, -0.0564f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.2153f, 2.1324f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.2803f, -0.3806f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2627f, -0.1215f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.6780f, -0.3136f, -0.5655f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0593f, 0.4351f, -0.0495f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.1891f, 2.1620f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.1989f, -0.4645f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVectorMultiply(XMVector3Transform(rgb, M), Scale); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +inline 
XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 0.17697f, 0.17697f, 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(XMVectorMultiply(xyz, Scale), M); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 3.2406f, -0.9689f, 0.0557f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -1.5372f, 1.8758f, -0.2040f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4986f, 0.0415f, 1.0570f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR lclr = XMVector3Transform(xyz, M); + + XMVECTOR sel = XMVectorGreater(lclr, Cutoff); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply(lclr, g_XMsrgbScale); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract(XMVectorMultiply(g_XMsrgbA1, XMVectorPow(lclr, Exp)), g_XMsrgbA); + + XMVECTOR clr = XMVectorSelect(smallC, largeC, sel); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4124f, 0.2126f, 0.0193f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3576f, 0.7152f, 0.1192f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.1805f, 0.0722f, 0.9505f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 2.4f, 2.4f, 2.4f, 1.0f } } }; + + XMVECTOR sel = XMVectorGreater(srgb, Cutoff); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide(srgb, g_XMsrgbScale); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow(XMVectorDivide(XMVectorAdd(srgb, g_XMsrgbA), g_XMsrgbA1), Exp); + + XMVECTOR lclr = XMVectorSelect(smallC, largeC, sel); + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(lclr, M); + + return XMVectorSelect(srgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 1.f } } }; + static const XMVECTORF32 Linear = { { { 12.92f, 12.92f, 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.055f, 1.055f, 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 InvGamma = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply(V, Linear); + XMVECTOR V1 = XMVectorSubtract(XMVectorMultiply(Scale, XMVectorPow(V, InvGamma)), Bias); + XMVECTOR select = XMVectorLess(V, 
Cutoff); + V = XMVectorSelect(V1, V0, select); + return XMVectorSelect(rgb, V, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 1.f } } }; + static const XMVECTORF32 ILinear = { { { 1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 Gamma = { { { 2.4f, 2.4f, 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply(V, ILinear); + XMVECTOR V1 = XMVectorPow(XMVectorMultiply(XMVectorAdd(V, Bias), Scale), Gamma); + XMVECTOR select = XMVectorGreater(V, Cutoff); + V = XMVectorSelect(V0, V1, select); + return XMVectorSelect(srgb, V, g_XMSelect1110); +} + +/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() noexcept +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) + if ((CPUInfo[2] & 0x18081001) != 0x18081001) + return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) +#if defined(__clang__) || defined(__GNUC__) + __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuidex(CPUInfo, 7, 0); +#endif + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on ARM platform + return true; +#else + // No intrinsics path 
always supported + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) noexcept +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex, RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle, CosIncidentAngle); + G = _mm_sub_ps(G, g_XMOne); + vTemp = _mm_add_ps(vTemp, G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G, vTemp); + G = _mm_max_ps(G, vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G, CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G, CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC, GSubC); + vTemp = _mm_mul_ps(GAddC, GAddC); + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + vResult = _mm_div_ps(vResult, vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC, CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC, CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC, g_XMOne); + GSubC = _mm_add_ps(GSubC, g_XMOne); + GAddC = _mm_mul_ps(GAddC, GAddC); + GSubC = _mm_mul_ps(GSubC, GSubC); + GAddC = _mm_div_ps(GAddC, GSubC); + GAddC = _mm_add_ps(GAddC, g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult, GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) noexcept +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle(float Angle) noexcept +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + + // Normalize the range from 0.0f to XM_2PI + Angle = 
Angle + XM_PI;
+    // Perform the modulo, unsigned
+    float fTemp = fabsf(Angle);
+    fTemp = fTemp - (XM_2PI * static_cast<float>(static_cast<uint32_t>(fTemp / XM_2PI)));
+    // Restore the number to the range of -XM_PI to XM_PI-epsilon
+    fTemp = fTemp - XM_PI;
+    // If the modulo'd value was negative, restore negation
+    if (Angle < 0.0f)
+    {
+        fTemp = -fTemp;
+    }
+    return fTemp;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSin(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 11-degree minimax approximation
+    float y2 = y * y;
+    return (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSinEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 7-degree minimax approximation
+    float y2 = y * y;
+    return (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCos(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    // 10-degree minimax approximation
+    float y2 = y * y;
+    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
+    return sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCosEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
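+    // Note: folding y into [-pi/2,pi/2] flips the sign of the cosine, since
+    // cos(pi - y) = -cos(y); the sign is tracked separately and reapplied
+    // after the polynomial is evaluated.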
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    // 6-degree minimax approximation
+    float y2 = y * y;
+    float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f;
+    return sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline void XMScalarSinCos
+(
+    float* pSin,
+    float* pCos,
+    float Value
+) noexcept
+{
+    assert(pSin);
+    assert(pCos);
+
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    float y2 = y * y;
+
+    // 11-degree minimax approximation
+    *pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
+
+    // 10-degree minimax approximation
+    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
+    *pCos = sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline void XMScalarSinCosEst
+(
+    float* pSin,
+    float* pCos,
+    float Value
+) noexcept
+{
+    assert(pSin);
+    assert(pCos);
+
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    float y2 = y * y;
+
+    // 7-degree minimax approximation
+    *pSin = (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y;
+
+    // 6-degree minimax approximation
+    float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f;
+    *pCos = sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarASin(float Value) noexcept
+{
+    // Clamp input to [-1,1].
+    bool nonnegative = (Value >= 0.0f);
+    float x = fabsf(Value);
+    float omx = 1.0f - x;
+    if (omx < 0.0f)
+    {
+        omx = 0.0f;
+    }
+    float root = sqrtf(omx);
+
+    // 7-degree minimax approximation
+    float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f;
+    result *= root;  // acos(|x|)
+
+    // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x)
+    return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2);
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarASinEst(float Value) noexcept
+{
+    // Clamp input to [-1,1].
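+    // Note: the polynomial below approximates acos(x)/sqrt(1-x) on [0,1];
+    // multiplying by sqrt(1-x) recovers acos(|x|), and asin(x) = pi/2 - acos(x)
+    // then gives the arcsine, with the sign restored for negative inputs.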
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + diff --git a/DirectXMath/DirectXMathVector.inl b/DirectXMath/DirectXMathVector.inl new file mode 100644 index 0000000..35d2e2a --- /dev/null +++ b/DirectXMath/DirectXMathVector.inl @@ -0,0 +1,14819 @@ +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#if defined(_XM_NO_INTRINSICS_)
+#define XMISNAN(x) isnan(x)
+#define XMISINF(x) isinf(x)
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_)
+
+#define XM3UNPACK3INTO4(l1, l2, l3) \
+    XMVECTOR V3 = _mm_shuffle_ps(l2, l3, _MM_SHUFFLE(0, 0, 3, 2));\
+    XMVECTOR V2 = _mm_shuffle_ps(l2, l1, _MM_SHUFFLE(3, 3, 1, 0));\
+    V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 0, 2));\
+    XMVECTOR V4 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(l3), 32 / 8))
+
+#define XM3PACK4INTO3(v2x) \
+    v2x = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 2, 1));\
+    V2 = _mm_shuffle_ps(V2, V1, _MM_SHUFFLE(2, 2, 0, 0));\
+    V1 = _mm_shuffle_ps(V1, V2, _MM_SHUFFLE(0, 2, 1, 0));\
+    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(0, 0, 2, 2));\
+    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(2, 1, 2, 0))
+
+#endif
+
+/****************************************************************************
+ *
+ * General Vector
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+ // Assignment operations
+ //------------------------------------------------------------------------------
+
+ //------------------------------------------------------------------------------
+ // Return a vector with all elements equaling zero
+inline XMVECTOR XM_CALLCONV XMVectorZero() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32(0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four floating point values
+inline XMVECTOR XM_CALLCONV XMVectorSet
+(
+    float x,
+    float y,
+    float z,
+    float w
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { { x, y, z, w } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t V0 = vcreate_f32(
+        static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&x))
+        | (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&y)) << 32));
+    float32x2_t V1 = vcreate_f32(
+        static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&z))
+        | (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&w)) << 32));
+    return vcombine_f32(V0, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps(w, z, y, x);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four integer values
+inline XMVECTOR XM_CALLCONV XMVectorSetInt
+(
+    uint32_t x,
+    uint32_t y,
+    uint32_t z,
+    uint32_t w
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = { { { x, y, z, w } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t V0 = vcreate_u32(static_cast<uint64_t>(x) | (static_cast<uint64_t>(y) << 32));
+    uint32x2_t V1 = vcreate_u32(static_cast<uint64_t>(z) | (static_cast<uint64_t>(w) << 32));
+    return vreinterpretq_f32_u32(vcombine_u32(V0, V1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set_epi32(static_cast<int>(w), static_cast<int>(z), static_cast<int>(y), static_cast<int>(x));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value
+inline XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult;
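+    // The chained assignment below writes Value into all four lanes.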
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32(Value);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps1(Value);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr(const float* pValue) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float Value = pValue[0];
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_dup_f32(pValue);
+#elif defined(_XM_AVX_INTRINSICS_)
+    return _mm_broadcast_ss(pValue);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1(pValue);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value
+inline XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(Value));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_set1_epi32(static_cast<int>(Value));
+    return _mm_castsi128_ps(vTemp);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(const uint32_t* pValue) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t Value = pValue[0];
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_dup_u32(pValue));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1(reinterpret_cast<const float*>(pValue));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits set (true mask)
+inline XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = { { { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_s32(vdupq_n_s32(-1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set1_epi32(-1);
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits clear (false mask)
+inline XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Replicate the x component of the vector
+inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = V.vector4_f32[0];
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return
vdupq_lane_f32(vget_low_f32(V), 0); +#elif defined(_XM_AVX2_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + return _mm_broadcastss_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[1]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_low_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[2]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[3]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = 1.0f; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7F800000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7FC00000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7FC00000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR 
XM_CALLCONV XMVectorSplatEpsilon() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = 0x34000000;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(0x34000000));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return g_XMEpsilon;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
+inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+        vResult.u[1] =
+        vResult.u[2] =
+        vResult.u[3] = 0x80000000U;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000U));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set1_epi32(static_cast<int>(0x80000000));
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return a floating point value via an index. This is not a recommended
+// function to use due to performance loss.
+inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[i];
+#else
+    XMVECTORF32 U;
+    U.v = V;
+    return U.f[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return the X component in an FPU register.
+inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cvtss_f32(V);
+#endif
+}
+
+// Return the Y component in an FPU register.
+inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    return _mm_cvtss_f32(vTemp);
+#endif
+}
+
+// Return the Z component in an FPU register.
+inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    return _mm_cvtss_f32(vTemp);
+#endif
+}
+
+// Return the W component in an FPU register.
+inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_f32(V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+    return _mm_cvtss_f32(vTemp);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit float location in memory.
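+// As with XMVectorGetByIndex above, the indexed form goes through memory;
+// prefer XMVectorGetXPtr/GetYPtr/GetZPtr/GetWPtr when the component is fixed.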
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetByIndexPtr(float* f, FXMVECTOR V, size_t i) noexcept
+{
+    assert(f != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    *f = V.vector4_f32[i];
+#else
+    XMVECTORF32 U;
+    U.v = V;
+    *f = U.f[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetXPtr(float* x, FXMVECTOR V) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(x, V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(x, V);
+#endif
+}
+
+// Store the Y component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetYPtr(float* y, FXMVECTOR V) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(y, V, 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *(reinterpret_cast<int*>(y)) = _mm_extract_ps(V, 1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    _mm_store_ss(y, vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetZPtr(float* z, FXMVECTOR V) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(z, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *(reinterpret_cast<int*>(z)) = _mm_extract_ps(V, 2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(z, vResult);
+#endif
+}
+
+// Store the W component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetWPtr(float* w, FXMVECTOR V) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(w, V, 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *(reinterpret_cast<int*>(w)) = _mm_extract_ps(V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+    _mm_store_ss(w, vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return an integer value via an index. This is not a recommended
+// function to use due to performance loss.
+inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[i];
+#else
+    XMVECTORU32 U;
+    U.v = V;
+    return U.u[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return the X component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
+#endif
+}
+
+// Return the Y component in an integer register.
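+// (On SSE4 the lane is extracted directly with _mm_extract_epi32; the plain
+// SSE path splats the lane with _mm_shuffle_epi32 and reads the low element.)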
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(1, 1, 1, 1));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the Z component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(2, 2, 2, 2));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the W component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(3, 3, 3, 3));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t* x, FXMVECTOR V, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#else
+    XMVECTORU32 U;
+    U.v = V;
+    *x = U.u[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t* x, FXMVECTOR V) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(x), V);
+#endif
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t* y, FXMVECTOR V) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *y = static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    _mm_store_ss(reinterpret_cast<float*>(y), vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit integer location in memory.
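+// (The SSE fallback stores through a float alias of the integer pointer.)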
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t* z, FXMVECTOR V) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *z = static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+    _mm_store_ss(reinterpret_cast<float*>(z), vResult);
+#endif
+}
+
+// Store the W component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t* w, FXMVECTOR V) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *w = static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+    _mm_store_ss(reinterpret_cast<float*>(w), vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Set a single indexed floating point component
+inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORF32 U;
+    U.v = V;
+    U.f[i] = f;
+    return U.v;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            x,
+            V.vector4_f32[1],
+            V.vector4_f32[2],
+            V.vector4_f32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(x, V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(x);
+    vResult = _mm_move_ss(V, vResult);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            V.vector4_f32[0],
+            y,
+            V.vector4_f32[2],
+            V.vector4_f32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(y, V, 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(y);
+    vResult = _mm_insert_ps(V, vResult, 0x10);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_set_ss(y);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            V.vector4_f32[0],
+            V.vector4_f32[1],
+            z,
+            V.vector4_f32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(z, V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(z);
+    vResult = _mm_insert_ps(V, vResult, 0x20);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult =
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w, V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps(V, vResult, 0x30); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float* f, size_t i) noexcept +{ + assert(f != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORF32 U; + U.v = V; + U.f[i] = *f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float* x) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + *x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V, vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float* y) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + *y, + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y, V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float* z) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + *z, + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z, V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = 
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(z);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to a floating point value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float* w) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 U = { { {
+            V.vector4_f32[0],
+            V.vector4_f32[1],
+            V.vector4_f32[2],
+            *w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_f32(w, V, 3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(w);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept
+{
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = x;
+    return tmp;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            x,
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(x, vreinterpretq_u32_f32(V), 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(x));
+    XMVECTOR vResult = _mm_move_ss(V, _mm_castsi128_ps(vTemp));
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            y,
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(y, vreinterpretq_u32_f32(V), 1));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128(V);
+    vResult = _mm_insert_epi32(vResult, static_cast<int>(y), 1);
+    return _mm_castsi128_ps(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(y));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            z,
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(z, vreinterpretq_u32_f32(V), 2));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128(V);
+    vResult = _mm_insert_epi32(vResult, static_cast<int>(z), 2);
+    return _mm_castsi128_ps(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vsetq_lane_u32(w, vreinterpretq_u32_f32(V), 3));
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128(V);
+    vResult = _mm_insert_epi32(vResult, static_cast<int>(w), 3);
+    return _mm_castsi128_ps(vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            *x,
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(x));
+    XMVECTOR vResult = _mm_move_ss(V, vTemp);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t* y) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            *y,
+            V.vector4_u32[2],
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(y));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t* z) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            *z,
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t* w) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            *w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle
+(
+    FXMVECTOR V,
+    uint32_t E0,
+    uint32_t E1,
+    uint32_t E2,
+    uint32_t E3
+) noexcept
+{
+    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+    _Analysis_assume_((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            V.vector4_f32[E0],
+            V.vector4_f32[E1],
+            V.vector4_f32[E2],
+            V.vector4_f32[E3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const uint32_t ControlElement[4] =
+    {
+        0x03020100, // XM_SWIZZLE_X
+        0x07060504, // XM_SWIZZLE_Y
+        0x0B0A0908, // XM_SWIZZLE_Z
+        0x0F0E0D0C, // XM_SWIZZLE_W
+    };
+
+    uint8x8x2_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32));
+    const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32));
+    const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_)
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+    return _mm_permutevar_ps(V, vControl);
+#else
+    auto aPtr = reinterpret_cast<const uint32_t*>(&V);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    pWork[0] = aPtr[E0];
+    pWork[1] = aPtr[E1];
+    pWork[2] = aPtr[E2];
+    pWork[3] = aPtr[E3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
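+// XMVectorPermute assembles a result from any four of the eight components of
+// the pair (V1, V2): indices 0-3 select from V1, indices 4-7 select from V2.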
+inline XMVECTOR XM_CALLCONV XMVectorPermute
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    uint32_t PermuteX,
+    uint32_t PermuteY,
+    uint32_t PermuteZ,
+    uint32_t PermuteW
+) noexcept
+{
+    assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+    _Analysis_assume_(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const uint32_t ControlElement[8] =
+    {
+        0x03020100, // XM_PERMUTE_0X
+        0x07060504, // XM_PERMUTE_0Y
+        0x0B0A0908, // XM_PERMUTE_0Z
+        0x0F0E0D0C, // XM_PERMUTE_0W
+        0x13121110, // XM_PERMUTE_1X
+        0x17161514, // XM_PERMUTE_1Y
+        0x1B1A1918, // XM_PERMUTE_1Z
+        0x1F1E1D1C, // XM_PERMUTE_1W
+    };
+
+    uint8x8x4_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V1));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V1));
+    tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(V2));
+    tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(V2));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteX]) | (static_cast<uint64_t>(ControlElement[PermuteY]) << 32));
+    const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteZ]) | (static_cast<uint64_t>(ControlElement[PermuteW]) << 32));
+    const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
+
+    XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+
+    __m128i vSelect = _mm_cmpgt_epi32(vControl, three);
+    vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
+
+    __m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
+    __m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
+
+    __m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
+    __m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
+
+    return _mm_or_ps(masked1, masked2);
+#else
+
+    const uint32_t* aPtr[2];
+    aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
+    aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    const uint32_t i0 = PermuteX & 3;
+    const uint32_t vi0 = PermuteX >> 2;
+    pWork[0] = aPtr[vi0][i0];
+
+    const uint32_t i1 = PermuteY & 3;
+    const uint32_t vi1 = PermuteY >> 2;
+    pWork[1] = aPtr[vi1][i1];
+
+    const uint32_t i2 = PermuteZ & 3;
+    const uint32_t vi2 = PermuteZ >> 2;
+    pWork[2] = aPtr[vi2][i2];
+
+    const uint32_t i3 = PermuteW & 3;
+    const uint32_t vi3 = PermuteW >> 2;
+    pWork[3] = aPtr[vi3][i3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Define a control vector to be used in XMVectorSelect
+// operations. The four integers specified in XMVectorSelectControl
+// serve as indices to select between components in two vectors.
+// The first index controls selection for the first component of
+// the vectors involved in a select operation, the second index
+// controls selection for the second component etc. A value of
+// zero for an index causes the corresponding component from the first
+// vector to be selected whereas a one causes the component from the
+// second vector to be selected instead.
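+// For example, XMVectorSelectControl(0, 0, 1, 1) yields a control vector for
+// which XMVectorSelect(V1, V2, Control) returns { V1.x, V1.y, V2.z, V2.w }.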
+
+inline XMVECTOR XM_CALLCONV XMVectorSelectControl
+(
+    uint32_t VectorIndex0,
+    uint32_t VectorIndex1,
+    uint32_t VectorIndex2,
+    uint32_t VectorIndex3
+) noexcept
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    // x=Index0,y=Index1,z=Index2,w=Index3
+    __m128i vTemp = _mm_set_epi32(static_cast<int>(VectorIndex3), static_cast<int>(VectorIndex2), static_cast<int>(VectorIndex1), static_cast<int>(VectorIndex0));
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    vTemp = _mm_cmpgt_epi32(vTemp, g_XMZero);
+    return _mm_castsi128_ps(vTemp);
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    int32x2_t V0 = vcreate_s32(static_cast<uint64_t>(VectorIndex0) | (static_cast<uint64_t>(VectorIndex1) << 32));
+    int32x2_t V1 = vcreate_s32(static_cast<uint64_t>(VectorIndex2) | (static_cast<uint64_t>(VectorIndex3) << 32));
+    int32x4_t vTemp = vcombine_s32(V0, V1);
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    return vreinterpretq_f32_u32(vcgtq_s32(vTemp, g_XMZero));
+#else
+    XMVECTOR ControlVector;
+    const uint32_t ControlElement[] =
+    {
+        XM_SELECT_0,
+        XM_SELECT_1
+    };
+
+    assert(VectorIndex0 < 2);
+    assert(VectorIndex1 < 2);
+    assert(VectorIndex2 < 2);
+    assert(VectorIndex3 < 2);
+    _Analysis_assume_(VectorIndex0 < 2);
+    _Analysis_assume_(VectorIndex1 < 2);
+    _Analysis_assume_(VectorIndex2 < 2);
+    _Analysis_assume_(VectorIndex3 < 2);
+
+    ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
+    ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
+    ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
+    ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
+
+    return ControlVector;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSelect
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Control
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]),
+            (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]),
+            (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]),
+            (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]),
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vbslq_f32(vreinterpretq_u32_f32(Control), V2, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_andnot_ps(Control, V1);
+    XMVECTOR vTemp2 = _mm_and_ps(V2, Control);
+    return _mm_or_ps(vTemp1, vTemp2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeXY
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0],
+            V2.vector4_u32[0],
+            V1.vector4_u32[1],
+            V2.vector4_u32[1],
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_unpacklo_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeZW
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[2],
+            V2.vector4_u32[2],
+            V1.vector4_u32[3],
+            V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[1];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_unpackhi_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept
+{
+    assert(Elements < 4);
+    _Analysis_assume_(Elements < 4);
+    return XMVectorPermute(V1, V2, Elements, ((Elements)+1), ((Elements)+2), ((Elements)+3));
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept
+{
+    assert(Elements < 4);
+    _Analysis_assume_(Elements < 4);
+    return XMVectorSwizzle(V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept
+{
+    assert(Elements < 4);
+    _Analysis_assume_(Elements < 4);
+    return XMVectorSwizzle(V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorInsert(
+    FXMVECTOR VD, FXMVECTOR VS,
+    uint32_t VSLeftRotateElements,
+    uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept
+{
+    XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1);
+    return XMVectorSelect(VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control);
+}
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Control = { { {
+            (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
+            (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
+        } } };
+    return Control.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vceqq_f32(V1, V2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpeq_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorEqualR
+(
+    uint32_t* pCR,
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    assert(pCR != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vreinterpret_u8_u32(vget_low_u32(vResult)), vreinterpret_u8_u32(vget_high_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Treat the components of the vectors as unsigned integers and +// compare individual bits between the two. This is useful for +// comparing control vectors and result vectors returned from +// other comparison operations. + +inline XMVECTOR XM_CALLCONV XMVectorEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 
0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vceqq_s32(vreinterpretq_s32_f32(V1), vreinterpretq_s32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualIntR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); + uint32_t CR = 0; + if (iTemp == 0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fDeltax = V1.vector4_f32[0] - V2.vector4_f32[0]; + float fDeltay = V1.vector4_f32[1] - V2.vector4_f32[1]; + float fDeltaz = V1.vector4_f32[2] - V2.vector4_f32[2]; + float fDeltaw = V1.vector4_f32[3] - V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + XMVECTORU32 Control = { { { + (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0, + (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + (fDeltaw <= Epsilon.vector4_f32[3]) ? 
0xFFFFFFFFU : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#ifdef _MSC_VER + return vacleq_f32(vDelta, Epsilon); +#else + return vreinterpretq_f32_u32(vcleq_f32(vabsq_f32(vDelta), Epsilon)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(V1, V2))); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vmvnq_u32( + vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_xor_ps(_mm_castsi128_ps(V), g_XMNegOneMask); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcgtq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcgeq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcltq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcleq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t vTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds)); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V); + // Blend answers + vTemp1 = vandq_u32(vTemp1, vTemp2); + return vreinterpretq_f32_u32(vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorInBoundsR +( + uint32_t* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t vTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds)); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V); + // Blend answers + vTemp1 = vandq_u32(vTemp1, vTemp2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTemp1)), vget_high_u8(vreinterpretq_u8_u32(vTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vreinterpretq_f32_u32(vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + + uint32_t CR = 0; + if (_mm_movemask_ps(vTemp1) == 0xf) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + XMISNAN(V.vector4_f32[0]) ? 
0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + // Flip results + return vreinterpretq_f32_u32(vmvnq_u32(vTempNan)); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + return _mm_cmpneq_ps(V, V); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTemp = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTemp = vceqq_f32(vreinterpretq_f32_u32(vTemp), g_XMInfinity); + // If any are infinity, the signs are true. + return vreinterpretq_f32_u32(vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Rounding and clamping operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMin +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0], + (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1], + (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2], + (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vminq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_min_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMax +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0], + (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1], + (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2], + (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmaxq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_max_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + // Round to nearest (even) a.k.a. 
banker's rounding
+    inline float round_to_nearest(float x) noexcept
+    {
+        float i = floorf(x);
+        x -= i;
+        if (x < 0.5f)
+            return i;
+        if (x > 0.5f)
+            return i + 1.f;
+
+        // Exactly halfway: round to the even neighbor
+        float int_part;
+        (void)modff(i / 2.f, &int_part);
+        if ((2.f * int_part) == i)
+        {
+            return i;
+        }
+
+        return i + 1.f;
+    }
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+#endif
+
+inline XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            Internal::round_to_nearest(V.vector4_f32[0]),
+            Internal::round_to_nearest(V.vector4_f32[1]),
+            Internal::round_to_nearest(V.vector4_f32[2]),
+            Internal::round_to_nearest(V.vector4_f32[3])
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
+    return vrndnq_f32(V);
+#else
+    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(V), g_XMNegativeZero);
+    float32x4_t sMagic = vreinterpretq_f32_u32(vorrq_u32(g_XMNoFraction, sign));
+    float32x4_t R1 = vaddq_f32(V, sMagic);
+    R1 = vsubq_f32(R1, sMagic);
+    float32x4_t R2 = vabsq_f32(V);
+    uint32x4_t mask = vcleq_f32(R2, g_XMNoFraction);
+    return vbslq_f32(mask, R1, V);
+#endif
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Adding the sign-matched magic constant 2^23 pushes the fraction bits out,
+    // so the FPU's round-to-nearest-even takes effect; subtracting it back
+    // leaves the rounded value. Values with |V| > 2^23 have no fraction anyway
+    // and keep their original bits via the mask below.
+    __m128 sign = _mm_and_ps(V, g_XMNegativeZero);
+    __m128 sMagic = _mm_or_ps(g_XMNoFraction, sign);
+    __m128 R1 = _mm_add_ps(V, sMagic);
+    R1 = _mm_sub_ps(R1, sMagic);
+    __m128 R2 = _mm_and_ps(V, g_XMAbsMask);
+    __m128 mask = _mm_cmple_ps(R2, g_XMNoFraction);
+    R2 = _mm_andnot_ps(mask, V);
+    R1 = _mm_and_ps(R1, mask);
+    XMVECTOR vResult = _mm_xor_ps(R1, R2);
+    return vResult;
+#endif
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma float_control(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR Result;
+    uint32_t i;
+
+    // Avoid C4701
+    Result.vector4_f32[0] = 0.0f;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (XMISNAN(V.vector4_f32[i]))
+        {
+            Result.vector4_u32[i] = 0x7FC00000;
+        }
+        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
+        {
+            Result.vector4_f32[i] = static_cast<float>(static_cast<int32_t>(V.vector4_f32[i]));
+        }
+        else
+        {
+            Result.vector4_f32[i] = V.vector4_f32[i];
+        }
+    }
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
+    return vrndq_f32(V);
+#else
+    float32x4_t vTest = vabsq_f32(V);
+    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
+
+    int32x4_t vInt = vcvtq_s32_f32(V);
+    float32x4_t vResult = vcvtq_f32_s32(vInt);
+
+    // All numbers less than 8388608 will use the round to int
+    // All others, use the ORIGINAL value
+    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
+#endif
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // To handle NAN, INF and numbers greater than 8388608, use masking
+    // Get the abs value
+    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    // Test for greater than 8388608 (all floats with no fractionals, NAN and INF)
+    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
+    // Convert to int 
and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + floorf(V.vector4_f32[0]), + floorf(V.vector4_f32[1]), + floorf(V.vector4_f32[2]), + floorf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + return vrndmq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + // Truncate + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + uint32x4_t vLargerMask = vcgtq_f32(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + float32x4_t vLarger = vcvtq_f32_s32(vreinterpretq_s32_u32(vLargerMask)); + vResult = vaddq_f32(vResult, vLarger); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_floor_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vLarger = _mm_cmpgt_ps(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = _mm_cvtepi32_ps(_mm_castps_si128(vLarger)); + vResult = _mm_add_ps(vResult, vLarger); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + ceilf(V.vector4_f32[0]), + ceilf(V.vector4_f32[1]), + ceilf(V.vector4_f32[2]), + ceilf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + return vrndpq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + // Truncate + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + uint32x4_t vSmallerMask = vcltq_f32(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + float32x4_t vSmaller = vcvtq_f32_s32(vreinterpretq_s32_u32(vSmallerMask)); + vResult = vsubq_f32(vResult, vSmaller); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_ceil_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + // 
To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vSmaller = _mm_cmplt_ps(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = _mm_cvtepi32_ps(_mm_castps_si128(vSmaller)); + vResult = _mm_sub_ps(vResult, vSmaller); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorClamp +( + FXMVECTOR V, + FXMVECTOR Min, + FXMVECTOR Max +) noexcept +{ + assert(XMVector4LessOrEqual(Min, Max)); + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVectorMax(Min, V); + Result = XMVectorMin(Max, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(Min, V); + vResult = vminq_f32(Max, vResult); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult; + vResult = _mm_max_ps(Min, V); + vResult = _mm_min_ps(Max, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + return XMVectorClamp(V, Zero, g_XMOne.v); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Set <0 to 0 + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + // Set>1 to 1 + return vminq_f32(vResult, vdupq_n_f32(1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Set>1 to 1 + return _mm_min_ps(vResult, g_XMOne); +#endif +} + +//------------------------------------------------------------------------------ +// Bitwise logical operations +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] & V2.vector4_u32[0], + V1.vector4_u32[1] & V2.vector4_u32[1], + V1.vector4_u32[2] & V2.vector4_u32[2], + V1.vector4_u32[3] & V2.vector4_u32[3] + } } }; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_and_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndCInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] & ~V2.vector4_u32[0], + V1.vector4_u32[1] & ~V2.vector4_u32[1], + V1.vector4_u32[2] & ~V2.vector4_u32[2], + V1.vector4_u32[3] & ~V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_andnot_si128(_mm_castps_si128(V2), _mm_castps_si128(V1)); + return _mm_castsi128_ps(V); +#endif +} + 
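+// Usage sketch (editorial note, not part of the DirectXMath source): the
+// comparison functions above return per-component all-ones/all-zeros masks,
+// which are designed to compose with the bitwise ops and XMVectorSelect.
+// For example, a branchless per-component absolute value:
+//
+//     XMVECTOR mask = XMVectorLess(v, XMVectorZero());        // 0xFFFFFFFF where v < 0
+//     XMVECTOR result = XMVectorSelect(v, XMVectorNegate(v), mask);
+//
+// XMVectorSelectControl(0, 1, 0, 1) builds the same kind of mask directly from
+// 0/1 indices (0 selects the component from the first argument, 1 from the
+// second).
+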
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorOrInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] | V2.vector4_u32[0], + V1.vector4_u32[1] | V2.vector4_u32[1], + V1.vector4_u32[2] | V2.vector4_u32[2], + V1.vector4_u32[3] | V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + ~(V1.vector4_u32[0] | V2.vector4_u32[0]), + ~(V1.vector4_u32[1] | V2.vector4_u32[1]), + ~(V1.vector4_u32[2] | V2.vector4_u32[2]), + ~(V1.vector4_u32[3] | V2.vector4_u32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t Result = vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + return vreinterpretq_f32_u32(vbicq_u32(g_XMNegOneMask, Result)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i Result; + Result = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + Result = _mm_andnot_si128(Result, g_XMNegOneMask); + return _mm_castsi128_ps(Result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorXorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] ^ V2.vector4_u32[0], + V1.vector4_u32[1] ^ V2.vector4_u32[1], + V1.vector4_u32[2] ^ V2.vector4_u32[2], + V1.vector4_u32[3] ^ V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_xor_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + -V.vector4_f32[0], + -V.vector4_f32[1], + -V.vector4_f32[2], + -V.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vnegq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Z; + + Z = _mm_setzero_ps(); + + return _mm_sub_ps(Z, V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAdd +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V1.vector4_f32[0] + V2.vector4_f32[0], + V1.vector4_f32[1] + V2.vector4_f32[1], + V1.vector4_f32[2] + V2.vector4_f32[2], + V1.vector4_f32[3] + V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vaddq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + 
return _mm_add_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + float32x4_t vTemp = vpaddq_f32(V, V); + return vpaddq_f32(vTemp, vTemp); +#else + float32x2_t v1 = vget_low_f32(V); + float32x2_t v2 = vget_high_f32(V); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#endif +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_hadd_ps(V, V); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); + XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); + vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(vTemp, vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAddAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Add the given angles together. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorAdd(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + float32x4_t vResult = vaddq_f32(V1, V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult, g_XMPi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_add_ps(V1, V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult, vOffset); + // Greater than or equal to Pi? 
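+    // (Editorial worked example: V1 = 3.0 and V2 = 1.0 give a raw sum of
+    // 4.0 rad; 4.0 >= Pi, so 2Pi is subtracted below, yielding about
+    // -2.283 rad, back inside [-Pi, Pi).)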
+ vOffset = _mm_cmpge_ps(vResult, g_XMPi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult, vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtract +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V1.vector4_f32[0] - V2.vector4_f32[0], + V1.vector4_f32[1] - V2.vector4_f32[1], + V1.vector4_f32[2] - V2.vector4_f32[2], + V1.vector4_f32[3] - V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsubq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sub_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Subtract the given angles. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorSubtract(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vsubq_f32(V1, V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult, g_XMPi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1, V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult, vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult, g_XMPi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult, vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] * V2.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + return vfmaq_f32(V3, V1, V2); +#else + return vmlaq_f32(V3, V1, V2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return XM_FMADD_PS(V1, V2, V3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] / V2.vector4_f32[0], + V1.vector4_f32[1] / V2.vector4_f32[1], + V1.vector4_f32[2] / V2.vector4_f32[2], + V1.vector4_f32[3] / V2.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + return vdivq_f32(V1, V2); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(V2); + float32x4_t S = vrecpsq_f32(Reciprocal, V2); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, V2); + Reciprocal = vmulq_f32(S, Reciprocal); + return vmulq_f32(V1, Reciprocal); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]), + V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]), + V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]), + V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]) + } } }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + return vfmsq_f32(V3, V1, V2); +#else + return vmlsq_f32(V3, V1, V2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return XM_FNMADD_PS(V1, V2, V3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorScale +( + FXMVECTOR V, + float ScaleFactor +) noexcept 
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            V.vector4_f32[0] * ScaleFactor,
+            V.vector4_f32[1] * ScaleFactor,
+            V.vector4_f32[2] * ScaleFactor,
+            V.vector4_f32[3] * ScaleFactor
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vmulq_n_f32(V, ScaleFactor);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ps1(ScaleFactor);
+    return _mm_mul_ps(vResult, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            1.f / V.vector4_f32[0],
+            1.f / V.vector4_f32[1],
+            1.f / V.vector4_f32[2],
+            1.f / V.vector4_f32[3]
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vrecpeq_f32(V);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_rcp_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            1.f / V.vector4_f32[0],
+            1.f / V.vector4_f32[1],
+            1.f / V.vector4_f32[2],
+            1.f / V.vector4_f32[3]
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
+    float32x4_t one = vdupq_n_f32(1.0f);
+    return vdivq_f32(one, V);
+#else
+    // 2 iterations of Newton-Raphson refinement
+    float32x4_t Reciprocal = vrecpeq_f32(V);
+    float32x4_t S = vrecpsq_f32(Reciprocal, V);
+    Reciprocal = vmulq_f32(S, Reciprocal);
+    S = vrecpsq_f32(Reciprocal, V);
+    return vmulq_f32(S, Reciprocal);
+#endif
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_div_ps(g_XMOne, V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Return an estimated square root
+inline XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            sqrtf(V.vector4_f32[0]),
+            sqrtf(V.vector4_f32[1]),
+            sqrtf(V.vector4_f32[2]),
+            sqrtf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // 1 iteration of Newton-Raphson refinement of sqrt
+    float32x4_t S0 = vrsqrteq_f32(V);
+    float32x4_t P0 = vmulq_f32(V, S0);
+    float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+    float32x4_t S1 = vmulq_f32(S0, R0);
+
+    XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
+    XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0));
+    XMVECTOR Result = vmulq_f32(V, S1);
+    XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
+    return XMVectorSelect(V, Result, Select);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_sqrt_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            sqrtf(V.vector4_f32[0]),
+            sqrtf(V.vector4_f32[1]),
+            sqrtf(V.vector4_f32[2]),
+            sqrtf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // 3 iterations of Newton-Raphson refinement of sqrt
+    float32x4_t S0 = vrsqrteq_f32(V);
+    float32x4_t P0 = vmulq_f32(V, S0);
+    float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+    float32x4_t S1 = vmulq_f32(S0, R0);
+    float32x4_t P1 = vmulq_f32(V, S1);
+    float32x4_t R1 = vrsqrtsq_f32(P1, S1);
+    float32x4_t S2 = vmulq_f32(S1, R1);
+    float32x4_t P2 = vmulq_f32(V, S2);
+    float32x4_t R2 = 
vrsqrtsq_f32(P2, S2); + float32x4_t S3 = vmulq_f32(S2, R2); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0)); + XMVECTOR Result = vmulq_f32(V, S3); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + } } }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t S0 = vrsqrteq_f32(V); + + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(V, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + + return vmulq_f32(S1, R1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + exp2f(V.vector4_f32[0]), + exp2f(V.vector4_f32[1]), + exp2f(V.vector4_f32[2]), + exp2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t itrunc = vcvtq_s32_f32(V); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(V, ftrunc); + + float32x4_t poly = vmlaq_f32(g_XMExpEst6, g_XMExpEst7, y); + poly = vmlaq_f32(g_XMExpEst5, poly, y); + poly = vmlaq_f32(g_XMExpEst4, poly, y); + poly = vmlaq_f32(g_XMExpEst3, poly, y); + poly = vmlaq_f32(g_XMExpEst2, poly, y); + poly = vmlaq_f32(g_XMExpEst1, poly, y); + poly = vmlaq_f32(g_XMOne, poly, y); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + uint32x4_t comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBin128); + float32x4_t result2 = vbslq_f32(comp, result0, g_XMInfinity); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32(comp, result1, result0); + + comp = 
vcltq_s32(vreinterpretq_s32_f32(V), g_XMBinNeg150); + float32x4_t result4 = vbslq_f32(comp, result3, g_XMZero); + + int32x4_t sign = vandq_s32(vreinterpretq_s32_f32(V), g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32(comp, result4, result2); + + int32x4_t t0 = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + t0 = vreinterpretq_s32_u32(vceqq_s32(t0, g_XMZero)); + t1 = vreinterpretq_s32_u32(vceqq_s32(t1, g_XMInfinity)); + int32x4_t isNaN = vbicq_s32(t1, t0); + + float32x4_t vResult = vbslq_f32(vreinterpretq_u32_s32(isNaN), g_XMQNaN, result5); + return vResult; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + + __m128 poly = XM_FMADD_PS(g_XMExpEst7, y, g_XMExpEst6); + poly = XM_FMADD_PS(poly, y, g_XMExpEst5); + poly = XM_FMADD_PS(poly, y, g_XMExpEst4); + poly = XM_FMADD_PS(poly, y, g_XMExpEst3); + poly = XM_FMADD_PS(poly, y, g_XMExpEst2); + poly = XM_FMADD_PS(poly, y, g_XMExpEst1); + poly = XM_FMADD_PS(poly, y, g_XMOne); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + powf(10.0f, V.vector4_f32[0]), + powf(10.0f, V.vector4_f32[1]), + 
powf(10.0f, V.vector4_f32[2]), + powf(10.0f, V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp10_ps(V); + return Result; +#else + // exp10(V) = exp2(vin*log2(10)) + XMVECTOR Vten = XMVectorMultiply(g_XMLg10, V); + return XMVectorExp2(Vten); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + expf(V.vector4_f32[0]), + expf(V.vector4_f32[1]), + expf(V.vector4_f32[2]), + expf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp_ps(V); + return Result; +#else + // expE(V) = exp2(vin*log2(e)) + XMVECTOR Ve = XMVectorMultiply(g_XMLgE, V); + return XMVectorExp2(Ve); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace Internal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, 
_MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace Internal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace Internal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + uint32x4_t c = vcgtq_s32(value, g_XM0000FFFF); // c = (v > 0xFFFF) + int32x4_t b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32(r); + int32x4_t v = vshlq_s32(value, r); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 
1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_s32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace Internal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + log2f(V.vector4_f32[0]), + log2f(V.vector4_f32[1]), + log2f(V.vector4_f32[2]), + log2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(vreinterpretq_s32_f32(g_XMZero), rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(vreinterpretq_s32_f32(g_XMOne), t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_f32(V, g_XMZero); + uint32x4_t isNotFinite = vcgtq_f32(V, g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
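+    // A normal float is laid out s|e8|t23, so the unbiased exponent is
+    // (bits >> 23) - 127 and log2(x) = e + log2(1 + t/2^23); the second
+    // term is what the polynomial in y approximates below.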
+ __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + log10f(V.vector4_f32[0]), + log10f(V.vector4_f32[1]), + log10f(V.vector4_f32[2]), + log10f(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for 
normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log10_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
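+    // Subnormals have a zero exponent field, so the significand is
+    // normalized by shifting its leading set bit up to bit 23; the
+    // effective exponent drops below -126 by the same shift amount.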
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + logf(V.vector4_f32[0]), + logf(V.vector4_f32[1]), + logf(V.vector4_f32[2]), + logf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
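+    // The exponent/significand split matches XMVectorLog2; only the final
+    // scale by g_XMInvLgE (= ln 2 = 1/log2(e)) turns log2 into a natural log.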
+ int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
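+    // The and/andnot/or triples below are branchless selects:
+    // (mask & a) | (~mask & b) picks a where the comparison mask is
+    // all-ones and b where it is all-zeros.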
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept +{ + return XMVectorLog2(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + powf(V1.vector4_f32[0], V2.vector4_f32[0]), + powf(V1.vector4_f32[1], V2.vector4_f32[1]), + powf(V1.vector4_f32[2], V2.vector4_f32[2]), + powf(V1.vector4_f32[3], V2.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { { { + powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), + powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), + 
powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), + powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) + } } }; + return vResult.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_pow_ps(V1, V2); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + XM_ALIGNED_DATA(16) float a[4]; + XM_ALIGNED_DATA(16) float b[4]; + _mm_store_ps(a, V1); + _mm_store_ps(b, V2); + XMVECTOR vResult = _mm_setr_ps( + powf(a[0], b[0]), + powf(a[1], b[1]), + powf(a[2], b[2]), + powf(a[3], b[3])); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult, V); + vResult = _mm_max_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32(V1, vResult, V2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + return XM_FNMADD_PS(vResult, V2, V1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32(Angles, vResult, g_XMTwoPi); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return XM_FNMADD_PS(vResult, g_XMTwoPi, Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) 
= sin(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
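+    // ORing the sign of x into pi gives c = +/-pi, so rflx = c - x reflects
+    // lanes with |x| > pi/2 back into [-pi/2, pi/2]; since
+    // cos(pi - x) = -cos(x), fsign flips the result for reflected lanes.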
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
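+    // Sine is odd, so sin(+/-pi - x) = sin(x) and the reflected argument can
+    // feed the sine polynomial directly; only cosine needs the sign fix-up.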
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SVML_INTRINSICS_) + *pSin = _mm_sincos_ps(pCos, V); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
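+    // Both kernels are polynomials in x^2 (sine carries one extra factor
+    // of x), so a single x2 value feeds the two Horner chains of fused
+    // multiply-adds below.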
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept +{ + // Cody and Waite algorithm to compute tangent. 
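+    // Argument reduction: VA = round(V * 2/pi), with pi/2 split across two
+    // constants (C0 = 1.570796371, C1 = 6.077100628e-11) so that
+    // VC = (V - VA*C0) - VA*C1 stays accurate even for large inputs. The
+    // rational kernel N(VC)/D(VC) approximates tan on [-pi/4, pi/4], and the
+    // parity of VA picks tan = N/D (even) or -cot = -D/N (odd). In scalar form:
+    //   n = roundf(v * 2.0f / XM_PI);
+    //   z = (v - n * C0) - n * C1;
+    //   t = (n even) ? N(z) / D(z) : -D(z) / N(z);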
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            tanf(V.vector4_f32[0]),
+            tanf(V.vector4_f32[1]),
+            tanf(V.vector4_f32[2]),
+            tanf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_SVML_INTRINSICS_)
+    XMVECTOR Result = _mm_tan_ps(V);
+    return Result;
+#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    static const XMVECTORF32 TanCoefficients0 = { { { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f } } };
+    static const XMVECTORF32 TanCoefficients1 = { { { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f } } };
+    static const XMVECTORF32 TanConstants = { { { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ } } };
+    static const XMVECTORU32 Mask = { { { 0x1, 0x1, 0x1, 0x1 } } };
+
+    XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);
+
+    XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
+    XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
+    XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);
+
+    XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);
+
+    VA = XMVectorRound(VA);
+
+    XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
+
+    XMVECTOR VB = XMVectorAbs(VA);
+
+    VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    VB = vreinterpretq_f32_u32(vcvtq_u32_f32(VB));
+#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    reinterpret_cast<__m128i*>(&VB)[0] = _mm_cvttps_epi32(VB);
+#else
+    for (size_t i = 0; i < 4; i++)
+    {
+        VB.vector4_u32[i] = static_cast<uint32_t>(VB.vector4_f32[i]);
+    }
+#endif
+
+    XMVECTOR VC2 = XMVectorMultiply(VC, VC);
+
+    XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
+    XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
+    XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
+    XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
+    XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
+    XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
+    XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
+    XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);
+
+    XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
+    VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
+
+    XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
+    XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
+    N = XMVectorMultiplyAdd(VC2, N, T5);
+    D = XMVectorMultiplyAdd(VC2, D, T2);
+    N = XMVectorMultiply(VC2, N);
+    D = XMVectorMultiplyAdd(VC2, D, T1);
+    N = XMVectorMultiplyAdd(VC, N, VC);
+    XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
+    D = XMVectorMultiplyAdd(VC2, D, T0);
+
+    N = XMVectorSelect(N, VC, VCNearZero);
+    D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
+
+    XMVECTOR R0 = XMVectorNegate(N);
+    XMVECTOR R1 = XMVectorDivide(N, D);
+    R0 = XMVectorDivide(D, R0);
+
+    XMVECTOR VIsZero = XMVectorEqual(V, Zero);
+
+    XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);
+
+    Result = XMVectorSelect(Result, Zero, VIsZero);
+
+    return Result;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 Result = { { {
+            sinhf(V.vector4_f32[0]),
+            sinhf(V.vector4_f32[1]),
+            sinhf(V.vector4_f32[2]),
+            sinhf(V.vector4_f32[3])
+        } } };
+    return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
+
+    XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V,
Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sinh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + coshf(V.vector4_f32[0]), + coshf(V.vector4_f32[1]), + coshf(V.vector4_f32[2]), + coshf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cosh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanhf(V.vector4_f32[0]), + tanhf(V.vector4_f32[1]), + tanhf(V.vector4_f32[2]), + tanhf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32(g_XMOneHalf.v, E, g_XMOneHalf.v); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tanh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = XM_FMADD_PS(E, g_XMOneHalf.v, g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v, E); + return _mm_sub_ps(g_XMOne.v, E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + asinf(V.vector4_f32[0]), + asinf(V.vector4_f32[1]), + asinf(V.vector4_f32[2]), + asinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = 
vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
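+    // The kernel computes acos(|V|) as sqrt(1 - |V|) * P(|V|); for negative
+    // inputs the identity acos(-x) = pi - acos(x) supplies the reflection
+    // applied after the polynomial.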
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR 
TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]) 
+ } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f } } }; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
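+    // The *Est variants reuse the XMVectorSin reduction but drop the
+    // low-order coefficient vector, trading accuracy for fewer multiply-adds.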
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. 
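+    // Illustrative note: after the reduction below, lanes with |x| > pi/2 are
+    // reflected to pi - x and their sign flag is set to -1, using the identity
+    // cos(pi - x) == -cos(x); e.g. cos(2.5) ~= -cos(pi - 2.5) ~= -0.8011.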
+ XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
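+    // Note on the idiom below: g_XMNegativeZero is 0x80000000 in every lane,
+    // so the AND extracts just the sign bits of x, and OR-ing them onto pi
+    // yields +pi for non-negative lanes and -pi for negative ones.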
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanf(V.vector4_f32[0]), + tanf(V.vector4_f32[1]), + tanf(V.vector4_f32[2]), + tanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tan_ps(V); + return Result; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result; + Result.f[0] = asinf(V.vector4_f32[0]); + Result.f[1] = asinf(V.vector4_f32[1]); + Result.f[2] = asinf(V.vector4_f32[2]); + Result.f[3] = asinf(V.vector4_f32[3]); + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
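+    // Shape of the estimate (illustrative summary): the polynomial in
+    // g_XMArcEstCoefficients times sqrt(1 - |V|) approximates acos(|V|);
+    // the select below reflects that to pi - t0 for negative inputs
+    // (acos(-x) == pi - acos(x)), and the final subtraction applies
+    // asin(V) == pi/2 - acos(V).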
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
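+    // Why the sqrt factor (illustrative note): acos has an unbounded
+    // derivative at |V| == 1, with acos(x) ~= sqrt(2 * (1 - x)) near 1, so
+    // factoring out sqrt(1 - |V|) leaves a smooth remainder that the
+    // 3-degree minimax polynomial can fit accurately.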
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32(g_XMATanEstCoefficients0, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + Result = 
XM_FMADD_PS(Result, x2, vConstants); + // ATanEstCoefficients0 is already splatted + Result = XM_FMADD_PS(Result, x2, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]), + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ } } }; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) noexcept +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_n_f32(V0, L, t); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps(V1, V0); + XMVECTOR S = _mm_set_ps1(t); + return XM_FMADD_PS(L, S, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, 
+ FXMVECTOR V1, + FXMVECTOR T +) noexcept +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_f32(V0, L, T); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps(V1, V0); + return XM_FMADD_PS(Length, T, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0); + vResult = vmlaq_n_f32(vResult, Tangent0, t0); + vResult = vmlaq_n_f32(vResult, Position1, p1); + vResult = vmlaq_n_f32(vResult, Tangent1, t1); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + vResult = XM_FMADD_PS(Tangent0, T0, vResult); + vResult = XM_FMADD_PS(Position1, P1, vResult); + vResult = XM_FMADD_PS(Tangent1, T1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T, T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } 
} }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMaskY)); + T3 = vaddq_f32(T3, T2); + // Add 1.0f to x + T3 = vaddq_f32(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32(Position0, vget_low_f32(T3), 0); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32(T3), 1); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32(T3), 0); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32(T3), 1); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } } }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = XM_FMADD_PS(T3, CatMulT3, T2); + // T3 now has the pre-result. + // I need to add t.y only + T2 = _mm_and_ps(T, g_XMMaskY); + T3 = _mm_add_ps(T3, T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3, _MM_SHUFFLE(0, 0, 0, 0)); + vResult = _mm_mul_ps(vResult, Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(1, 1, 1, 1)); + vResult = XM_FMADD_PS(T2, Tangent0, vResult); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(2, 2, 2, 2)); + vResult = XM_FMADD_PS(T2, Position1, vResult); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(3, 3, 3, 3)); + vResult = XM_FMADD_PS(T3, Tangent1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) noexcept +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + 
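+    // The four weighted terms are paired into two multiply-accumulate chains,
+    // (P1*p1 + P0*p0) and (P3*p3 + P2*p2), then summed; this keeps the
+    // floating-point dependency chain shorter than one serial accumulation.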
XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0, P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P1 = _mm_mul_ps(Position1, P1); + P0 = XM_FMADD_PS(Position0, P0, P1); + P3 = _mm_mul_ps(Position3, P3); + P2 = XM_FMADD_PS(Position2, P2, P3); + P0 = _mm_add_ps(P0, P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTORF32 vResult = { { { + 0.5f * ((-fx * fx * fx + 2 * fx * fx - fx) * Position0.vector4_f32[0] + + (3 * fx * fx * fx - 5 * fx * fx + 2) * Position1.vector4_f32[0] + + (-3 * fx * fx * fx + 4 * fx * fx + fx) * Position2.vector4_f32[0] + + (fx * fx * fx - fx * fx) * Position3.vector4_f32[0]), + + 0.5f * ((-fy * fy * fy + 2 * fy * fy - fy) * Position0.vector4_f32[1] + + (3 * fy * fy * fy - 5 * fy * fy + 2) * Position1.vector4_f32[1] + + (-3 * fy * fy * fy + 4 * fy * fy + fy) * Position2.vector4_f32[1] + + (fy * fy * fy - fy * fy) * Position3.vector4_f32[1]), + + 0.5f * ((-fz * fz * fz + 2 * fz * fz - fz) * Position0.vector4_f32[2] + + (3 * fz * fz * fz - 5 * fz * fz + 2) * Position1.vector4_f32[2] + + (-3 * fz * fz * fz + 4 * fz * fz + fz) * Position2.vector4_f32[2] + + (fz * fz * fz - fz * fz) * Position3.vector4_f32[2]), + + 0.5f * ((-fw * fw * fw + 2 * fw * fw - fw) * Position0.vector4_f32[3] + + (3 * fw * fw * fw - 5 * fw * fw + 2) * Position1.vector4_f32[3] + + (-3 * fw * fw * fw + 4 * fw * fw + fw) * Position2.vector4_f32[3] + + (fw * fw * fw - fw * fw) * Position3.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2, T2); + vResult = vsubq_f32(vResult, T); + vResult = vsubq_f32(vResult, T3); + vResult = vmulq_f32(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3, Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp, Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2, Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp, T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3, T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult, g_XMOneHalf); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 
2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2, T2); + vResult = _mm_sub_ps(vResult, T); + vResult = _mm_sub_ps(vResult, T3); + vResult = _mm_mul_ps(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3, Catmul3); + vTemp = XM_FNMADD_PS(T2, Catmul5, vTemp); + vTemp = _mm_add_ps(vTemp, Catmul2); + vResult = XM_FMADD_PS(vTemp, Position1, vResult); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2, Catmul4); + vTemp = XM_FNMADD_PS(T3, Catmul3, vTemp); + vTemp = _mm_add_ps(vTemp, T); + vResult = XM_FMADD_PS(vTemp, Position2, vResult); + // Position3 is the last term + T3 = _mm_sub_ps(T3, T2); + vResult = XM_FMADD_PS(T3, Position3, vResult); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_n_f32(Position0, R1, f); + return vmlaq_n_f32(R1, R2, g); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + XMVECTOR SF = _mm_set_ps1(f); + R1 = XM_FMADD_PS(R1, SF, Position0); + XMVECTOR SG = _mm_set_ps1(g); + return XM_FMADD_PS(R2, SG, R1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_f32(Position0, R1, F); + return vmlaq_f32(R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + R1 = XM_FMADD_PS(R1, F, Position0); + return XM_FMADD_PS(R2, G, R1); +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + 
****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + 
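+    // Idiom used throughout these 2D comparisons: the two 32-bit lane masks
+    // are read back as one 64-bit value, so 0xFFFFFFFFFFFFFFFF means both
+    // lanes compared true and 0 means neither lane did.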
uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2)); +#ifdef _MSC_VER + uint32x2_t vTemp = vacle_f32(vDelta, vget_low_u32(Epsilon)); +#else + uint32x2_t vTemp = vcle_f32(vabs_f32(vDelta), vget_low_f32(Epsilon)); +#endif + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + return (r == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + return 
(vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32(vget_low_f32(V1), vget_low_f32(V2)); + return 
(vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + float32x2_t B = vget_low_f32(Bounds); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL, B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2, VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1, ivTemp2); + // x and y in bounds? + return (vget_lane_u64(vreinterpret_u64_u32(ivTemp1), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1) & 0x3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Test against itself. NaN is always not equal + uint32x2_t vTempNan = vceq_f32(VL, VL); + // If x or y are NaN, the mask is zero + return (vget_lane_u64(vreinterpret_u64_u32(vTempNan), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal
+    XMVECTOR vTempNan = _mm_cmpneq_ps(V, V);
+    // If x or y are NaN, the mask is non-zero
+    return ((_mm_movemask_ps(vTempNan) & 3) != 0);
+#endif
+}
+
+#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma float_control(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    return (XMISINF(V.vector4_f32[0]) ||
+        XMISINF(V.vector4_f32[1]));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Mask off the sign bit
+    uint32x2_t vTemp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(V)), vget_low_u32(g_XMAbsMask));
+    // Compare to infinity
+    vTemp = vceq_f32(vreinterpret_f32_u32(vTemp), vget_low_f32(g_XMInfinity));
+    // If x or y are infinity, the mask is non-zero
+    return vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Mask off the sign bit
+    __m128 vTemp = _mm_and_ps(V, g_XMAbsMask);
+    // Compare to infinity
+    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
+    // If x or y are infinity, the mask is non-zero
+    return ((_mm_movemask_ps(vTemp) & 3) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result;
+    Result.f[0] =
+        Result.f[1] =
+        Result.f[2] =
+        Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1];
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Perform the dot product on x and y
+    float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vget_low_f32(V2));
+    vTemp = vpadd_f32(vTemp, vTemp);
+    return vcombine_f32(vTemp, vTemp);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_dp_ps(V1, V2, 0x3f);
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vDot = _mm_mul_ps(V1, V2);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_moveldup_ps(vDot);
+    return vDot;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V1, V2);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    return vLengthSq;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Cross
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
+
+#if defined(_XM_NO_INTRINSICS_)
+    float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
+    XMVECTORF32 vResult;
+    vResult.f[0] =
+        vResult.f[1] =
+        vResult.f[2] =
+        vResult.f[3] = fCross;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Negate = { { { 1.f, -1.f, 0, 0 } } };
+
+    float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2)));
+    vTemp = vmul_f32(vTemp, vget_low_f32(Negate));
+    vTemp = vpadd_f32(vTemp, vTemp);
+    return vcombine_f32(vTemp, vTemp);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap x and y
+    XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
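+    // vResult now holds < V2.y, V2.x, V2.y, V2.x >, so the multiply below
+    // produces V1.x*V2.y in lane 0 and V1.y*V2.x in lane 1.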
+ // Perform the muls + vResult = _mm_mul_ps(vResult, V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1)); + // Sub the values + vResult = _mm_sub_ss(vResult, vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return 
vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(vTemp); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
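+// For inputs that may be zero-length or infinite, prefer XMVector2Normalize
+// (defined below), which maps zero vectors to zero and infinite vectors to QNaN.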
+
+//------------------------------------------------------------------------------
+// XMVector2NormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2ReciprocalLength(V);
+    Result = XMVectorMultiply(V, Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    // Dot2
+    float32x2_t vTemp = vmul_f32(VL, VL);
+    vTemp = vpadd_f32(vTemp, vTemp);
+    // Reciprocal sqrt (estimate)
+    vTemp = vrsqrte_f32(vTemp);
+    // Normalize
+    float32x2_t Result = vmul_f32(VL, vTemp);
+    return vcombine_f32(Result, Result);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
+    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
+    return _mm_mul_ps(vResult, V);
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_rsqrt_ss(vLengthSq);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    vLengthSq = _mm_mul_ps(vLengthSq, V);
+    return vLengthSq;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = _mm_rsqrt_ss(vLengthSq);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    vLengthSq = _mm_mul_ps(vLengthSq, V);
+    return vLengthSq;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR vResult = XMVector2Length(V);
+    float fLength = vResult.vector4_f32[0];
+
+    // Prevent divide by zero
+    if (fLength > 0)
+    {
+        fLength = 1.0f / fLength;
+    }
+
+    vResult.vector4_f32[0] = V.vector4_f32[0] * fLength;
+    vResult.vector4_f32[1] = V.vector4_f32[1] * fLength;
+    vResult.vector4_f32[2] = V.vector4_f32[2] * fLength;
+    vResult.vector4_f32[3] = V.vector4_f32[3] * fLength;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    // Dot2
+    float32x2_t vTemp = vmul_f32(VL, VL);
+    vTemp = vpadd_f32(vTemp, vTemp);
+    uint32x2_t VEqualsZero = vceq_f32(vTemp, vdup_n_f32(0));
+    uint32x2_t VEqualsInf = vceq_f32(vTemp, vget_low_f32(g_XMInfinity));
+    // Reciprocal sqrt (2 iterations of Newton-Raphson)
+    float32x2_t S0 = vrsqrte_f32(vTemp);
+    float32x2_t P0 = vmul_f32(vTemp, S0);
+    float32x2_t R0 = vrsqrts_f32(P0, S0);
+    float32x2_t S1 = vmul_f32(S0, R0);
+    float32x2_t P1 = vmul_f32(vTemp, S1);
+    float32x2_t R1 = vrsqrts_f32(P1, S1);
+    vTemp = vmul_f32(S1, R1);
+    // Normalize
+    float32x2_t Result = vmul_f32(VL, vTemp);
+    Result = vbsl_f32(VEqualsZero, vdup_n_f32(0), Result);
+    Result = vbsl_f32(VEqualsInf, vget_low_f32(g_XMQNaN), Result);
+    return vcombine_f32(Result, Result);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#elif defined(_XM_SSE3_INTRINSICS_)
+    // Perform the dot product on x and y only
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_moveldup_ps(vLengthSq);
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y only
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
+    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V, vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult, vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
+    vResult = _mm_or_ps(vTemp1, vTemp2);
+    return vResult;
+#endif
+}
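// Editorial note (not part of the patched source): the masking above gives
// XMVector2Normalize well-defined results on degenerate input -- a zero vector
// normalizes to zero, and an infinite-length vector produces QNaN components:
//
//     XMVECTOR n = XMVector2Normalize(XMVectorZero()); // (0, 0, 0, 0), no divide-by-zero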
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2ClampLength
+(
+    FXMVECTOR V,
+    float LengthMin,
+    float LengthMax
+) noexcept
+{
+    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
+    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
+    return XMVector2ClampLengthV(V, ClampMin, ClampMax);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV
+(
+    FXMVECTOR V,
+    FXMVECTOR LengthMin,
+    FXMVECTOR LengthMax
+) noexcept
+{
+    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
+    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
+    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
+    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
+    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));
+
+    XMVECTOR LengthSq = XMVector2LengthSq(V);
+
+    const XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
+
+    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
+    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
+
+    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
+
+    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
+
+    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+    Length = XMVectorSelect(LengthSq, Length, Select);
+    Normal = XMVectorSelect(LengthSq, Normal, Select);
+
+    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
+    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
+
+    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+
+    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
+
+    // Preserve the original vector (with no precision loss) if the length falls within the given range
+    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
+    Result = XMVectorSelect(Result, V, Control);
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Reflect
+(
+    FXMVECTOR Incident,
+    FXMVECTOR Normal
+) noexcept
+{
+    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+
+    XMVECTOR Result;
+    Result = XMVector2Dot(Incident, Normal);
+    Result = XMVectorAdd(Result, Result);
+    Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
+    return Result;
+}
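// Editorial worked example (not part of the patched source): XMVector2Reflect
// computes Incident - 2*dot(Incident, Normal)*Normal. For an incident ray
// (1, -1) hitting a surface with unit normal (0, 1), the dot product is -1,
// so the result is (1, -1) - 2*(-1)*(0, 1) = (1, 1):
//
//     XMVECTOR r = XMVector2Reflect(XMVectorSet(1.f, -1.f, 0.f, 0.f),
//                                   XMVectorSet(0.f, 1.f, 0.f, 0.f));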
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Refract
+(
+    FXMVECTOR Incident,
+    FXMVECTOR Normal,
+    float RefractionIndex
+) noexcept
+{
+    XMVECTOR Index = XMVectorReplicate(RefractionIndex);
+    return XMVector2RefractV(Incident, Normal, Index);
+}
+
+//------------------------------------------------------------------------------
+
+// Return the refraction of a 2D vector
+inline XMVECTOR XM_CALLCONV XMVector2RefractV
+(
+    FXMVECTOR Incident,
+    FXMVECTOR Normal,
+    FXMVECTOR RefractionIndex
+) noexcept
+{
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    float IDotN = (Incident.vector4_f32[0] * Normal.vector4_f32[0]) + (Incident.vector4_f32[1] * Normal.vector4_f32[1]);
+    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    float RY = 1.0f - (IDotN * IDotN);
+    float RX = 1.0f - (RY * RefractionIndex.vector4_f32[0] * RefractionIndex.vector4_f32[0]);
+    RY = 1.0f - (RY * RefractionIndex.vector4_f32[1] * RefractionIndex.vector4_f32[1]);
+    if (RX >= 0.0f)
+    {
+        RX = (RefractionIndex.vector4_f32[0] * Incident.vector4_f32[0]) - (Normal.vector4_f32[0] * ((RefractionIndex.vector4_f32[0] * IDotN) + sqrtf(RX)));
+    }
+    else
+    {
+        RX = 0.0f;
+    }
+    if (RY >= 0.0f)
+    {
+        RY = (RefractionIndex.vector4_f32[1] * Incident.vector4_f32[1]) - (Normal.vector4_f32[1] * ((RefractionIndex.vector4_f32[1] * IDotN) + sqrtf(RY)));
+    }
+    else
+    {
+        RY = 0.0f;
+    }
+
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] = RX;
+    vResult.vector4_f32[1] = RY;
+    vResult.vector4_f32[2] = 0.0f;
+    vResult.vector4_f32[3] = 0.0f;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t IL = vget_low_f32(Incident);
+    float32x2_t NL = vget_low_f32(Normal);
+    float32x2_t RIL = vget_low_f32(RefractionIndex);
+    // Get the 2D Dot product of Incident-Normal
+    float32x2_t vTemp = vmul_f32(IL, NL);
+    float32x2_t IDotN = vpadd_f32(vTemp, vTemp);
+    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    vTemp = vmls_f32(vget_low_f32(g_XMOne), IDotN, IDotN);
+    vTemp = vmul_f32(vTemp, RIL);
+    vTemp = vmls_f32(vget_low_f32(g_XMOne), vTemp, RIL);
+    // If any terms are <=0, sqrt() will fail, punt to zero
+    uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero));
+    // Sqrt(vTemp)
+    float32x2_t S0 = vrsqrte_f32(vTemp);
+    float32x2_t P0 = vmul_f32(vTemp, S0);
+    float32x2_t R0 = vrsqrts_f32(P0, S0);
+    float32x2_t S1 = vmul_f32(S0, R0);
+    float32x2_t P1 = vmul_f32(vTemp, S1);
+    float32x2_t R1 = vrsqrts_f32(P1, S1);
+    float32x2_t S2 = vmul_f32(S1, R1);
+    vTemp = vmul_f32(vTemp, S2);
+    // R = RefractionIndex * IDotN + sqrt(R)
+    vTemp = vmla_f32(vTemp, RIL, IDotN);
+    // Result = RefractionIndex * Incident - Normal * R
+    float32x2_t vResult = vmul_f32(RIL, IL);
+    vResult = vmls_f32(vResult, vTemp, NL);
+    vResult = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(vResult), vMask));
+    return vcombine_f32(vResult, vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+    // Get the 2D Dot product of Incident-Normal
+    XMVECTOR IDotN = XMVector2Dot(Incident, Normal);
+    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    XMVECTOR vTemp = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
+    vTemp = _mm_mul_ps(vTemp, RefractionIndex);
+    vTemp = XM_FNMADD_PS(vTemp, RefractionIndex, g_XMOne);
+    // If any terms are <=0, sqrt() will fail, punt to zero
+    XMVECTOR vMask = _mm_cmpgt_ps(vTemp, g_XMZero);
+    // R = RefractionIndex * IDotN + sqrt(R)
+    vTemp = _mm_sqrt_ps(vTemp);
+    vTemp = XM_FMADD_PS(RefractionIndex, IDotN, vTemp);
+    // Result = RefractionIndex * Incident - Normal * R
+    XMVECTOR vResult = _mm_mul_ps(RefractionIndex, Incident);
+    vResult = XM_FNMADD_PS(vTemp, Normal, vResult);
+    vResult = _mm_and_ps(vResult, vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            -V.vector4_f32[1],
+            V.vector4_f32[0],
+            0.f,
+            0.f
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Negate = { { { -1.f, 1.f, 0, 0 } } };
+    const float32x2_t zero = vdup_n_f32(0);
+
+    float32x2_t VL = vget_low_f32(V);
+    float32x2_t Result = vmul_f32(vrev64_f32(VL), vget_low_f32(Negate));
+    return vcombine_f32(Result, zero);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
+    vResult = _mm_mul_ps(vResult, g_XMNegateX);
+    return vResult;
+#endif
+}
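// Editorial note (not part of the patched source): XMVector2Orthogonal returns
// (-y, x), the input rotated 90 degrees counter-clockwise, so its dot product
// with the input is always zero:
//
//     XMVECTOR p = XMVector2Orthogonal(XMVectorSet(1.f, 0.f, 0.f, 0.f)); // (0, 1, 0, 0)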
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+) noexcept
+{
+    XMVECTOR Result = XMVector2Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACosEst(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+) noexcept
+{
+    XMVECTOR Result = XMVector2Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne);
+    Result = XMVectorACos(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    XMVECTOR L1 = XMVector2ReciprocalLength(V1);
+    XMVECTOR L2 = XMVector2ReciprocalLength(V2);
+
+    XMVECTOR Dot = XMVector2Dot(V1, V2);
+
+    L1 = XMVectorMultiply(L1, L2);
+
+    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
+    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
+
+    return XMVectorACos(CosAngle);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance
+(
+    FXMVECTOR LinePoint1,
+    FXMVECTOR LinePoint2,
+    FXMVECTOR Point
+) noexcept
+{
+    // Given a vector PointVector from LinePoint1 to Point and a vector
+    // LineVector from LinePoint1 to LinePoint2, the scaled distance
+    // PointProjectionScale from LinePoint1 to the perpendicular projection
+    // of PointVector onto the line is defined as:
+    //
+    //     PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
+
+    XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
+    XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
+
+    XMVECTOR LengthSq = XMVector2LengthSq(LineVector);
+
+    XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector);
+    PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
+
+    XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
+    DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
+
+    return XMVector2Length(DistanceVector);
+}
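// Editorial worked example (not part of the patched source): with the scale
// t = dot(P-A, B-A) / LengthSq(B-A), the foot of the perpendicular is A + t*(B-A)
// and the function returns |(P-A) - t*(B-A)|. For A=(0,0), B=(2,0), P=(1,1):
// t = 2/4 = 0.5, the foot is (1,0), and the returned distance is 1:
//
//     XMVECTOR d = XMVector2LinePointDistance(XMVectorSet(0, 0, 0, 0),
//         XMVectorSet(2, 0, 0, 0), XMVectorSet(1, 1, 0, 0));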
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2IntersectLine
+(
+    FXMVECTOR Line1Point1,
+    FXMVECTOR Line1Point2,
+    FXMVECTOR Line2Point1,
+    GXMVECTOR Line2Point2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1);
+    XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1);
+    XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1);
+
+    XMVECTOR C1 = XMVector2Cross(V1, V2);
+    XMVECTOR C2 = XMVector2Cross(V2, V3);
+
+    XMVECTOR Result;
+    const XMVECTOR Zero = XMVectorZero();
+    if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v))
+    {
+        if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v))
+        {
+            // Coincident
+            Result = g_XMInfinity.v;
+        }
+        else
+        {
+            // Parallel
+            Result = g_XMQNaN.v;
+        }
+    }
+    else
+    {
+        // Intersection point = Line1Point1 + V1 * (C2 / C1)
+        XMVECTOR Scale = XMVectorReciprocal(C1);
+        Scale = XMVectorMultiply(C2, Scale);
+        Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
+    }
+
+    return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
+    XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
+    XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
+    // Generate the cross products
+    XMVECTOR C1 = XMVector2Cross(V1, V2);
+    XMVECTOR C2 = XMVector2Cross(V2, V3);
+    // If C1 is not close to epsilon, use the calculated value
+    XMVECTOR vResultMask = _mm_setzero_ps();
+    vResultMask = _mm_sub_ps(vResultMask, C1);
+    vResultMask = _mm_max_ps(vResultMask, C1);
+    // 0xFFFFFFFF if the calculated value is to be used
+    vResultMask = _mm_cmpgt_ps(vResultMask, g_XMEpsilon);
+    // If C1 is close to epsilon, which fail type is it? INFINITY or NAN?
+    XMVECTOR vFailMask = _mm_setzero_ps();
+    vFailMask = _mm_sub_ps(vFailMask, C2);
+    vFailMask = _mm_max_ps(vFailMask, C2);
+    vFailMask = _mm_cmple_ps(vFailMask, g_XMEpsilon);
+    XMVECTOR vFail = _mm_and_ps(vFailMask, g_XMInfinity);
+    vFailMask = _mm_andnot_ps(vFailMask, g_XMQNaN);
+    // vFail is NAN or INF
+    vFail = _mm_or_ps(vFail, vFailMask);
+    // Intersection point = Line1Point1 + V1 * (C2 / C1)
+    XMVECTOR vResult = _mm_div_ps(C2, C1);
+    vResult = XM_FMADD_PS(vResult, V1, Line1Point1);
+    // Use result, or failure value
+    vResult = _mm_and_ps(vResult, vResultMask);
+    vResultMask = _mm_andnot_ps(vResultMask, vFail);
+    vResult = _mm_or_ps(vResult, vResultMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    float32x4_t Result = vmlaq_lane_f32(M.r[3], M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = XM_FMADD_PS(vResult, M.r[1], M.r[3]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
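// Editorial note (not part of the patched source): XMVector2Transform treats V
// as the row vector (x, y, 0, 1) -- it accumulates x*M.r[0] + y*M.r[1] + M.r[3],
// so the matrix translation row is applied and the result keeps the transformed
// z and w. For projective matrices, use XMVector2TransformCoord below, which
// also divides through by w.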
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT2) * 4;
+
+                float32x2_t r3 = vget_low_f32(row3);
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
+                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N
+
+                XM_PREFETCH(pInputVector);
+
+                r3 = vget_high_f32(row3);
+                r = vget_high_f32(row0);
+                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O
+                XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P
+
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+
+                r = vget_high_f32(row1);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+                float32x4x4_t R;
+                R.val[0] = vResult0;
+                R.val[1] = vResult1;
+                R.val[2] = vResult2;
+                R.val[3] = vResult3;
+
+                vst4q_f32(reinterpret_cast<float*>(pOutputVector), R);
+                pOutputVector += sizeof(XMFLOAT4) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y
+
+        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
+
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT4))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                        __m256 vTempA = _mm256_mul_ps(X1, row0);
+                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                        vTempA = _mm256_add_ps(vTempA, vTempB);
+                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                        X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1);
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0);
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X2);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                        __m256 vTempA = _mm256_mul_ps(X1, row0);
+                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                        vTempA = _mm256_add_ps(vTempA, vTempB);
+                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                        X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1);
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0);
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X2);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                    __m256 vTempA = _mm256_mul_ps(X1, row0);
+                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                    vTempA = _mm256_add_ps(vTempA, vTempB);
+                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempA));
+                    pOutputVector += OutputStride;
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempA2));
+                    pOutputVector += OutputStride;
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempA, 1));
+                    pOutputVector += OutputStride;
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempA2, 1));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
+            {
+                // Packed input, aligned output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = XM_FMADD_PS(Y, row1, row3);
+                    vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+            else
+            {
+                // Packed input, unaligned output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = XM_FMADD_PS(Y, row1, row3);
+                    vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
+        {
+            // Aligned input, aligned output
+            for (; i < VectorCount; i++)
+            {
+                XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+                pInputVector += InputStride;
+
+                XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Aligned input, unaligned output
+            for (; i < VectorCount; i++)
+            {
+                XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+                pInputVector += InputStride;
+
+                XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    XMVECTOR W = XMVectorSplatW(Result);
+    return XMVectorDivide(Result, W);
+}
+
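// Editorial usage sketch (not part of the patched source; 'pt' and 'worldViewProj'
// are hypothetical caller values): TransformCoord projects a 2D point through a
// full 4x4 transform and divides by the resulting w, so it also works with
// perspective matrices; for purely affine transforms (w stays 1) it matches
// XMVector2Transform:
//
//     XMVECTOR clip = XMVector2TransformCoord(pt, worldViewProj);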
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        XMVECTOR W = XMVectorSplatW(Result);
+
+        Result = XMVectorDivide(Result, W);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT2) * 4;
+
+                float32x2_t r3 = vget_low_f32(row3);
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
+                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N
+
+                XM_PREFETCH(pInputVector);
+
+                r3 = vget_high_f32(row3);
+                r = vget_high_f32(row0);
+                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P
+
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+
+                r = vget_high_f32(row1);
+                W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
+                V.val[0] = vdivq_f32(vResult0, W);
+                V.val[1] = vdivq_f32(vResult1, W);
+#else
+                // 2 iterations of Newton-Raphson refinement of reciprocal
+                float32x4_t Reciprocal = vrecpeq_f32(W);
+                float32x4_t S = vrecpsq_f32(Reciprocal, W);
+                Reciprocal = vmulq_f32(S, Reciprocal);
+                S = vrecpsq_f32(Reciprocal, W);
+                Reciprocal = vmulq_f32(S, Reciprocal);
+
+                V.val[0] = vmulq_f32(vResult0, Reciprocal);
+                V.val[1] = vmulq_f32(vResult1, Reciprocal);
+#endif
+
+                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
+                pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y
+
+        V = vget_high_f32(vResult);
+        float32x2_t W = vdup_lane_f32(V, 1);
+
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
+        V = vget_low_f32(vResult);
+        V = vdiv_f32(V, W);
+#else
+        // 2 iterations of Newton-Raphson refinement of reciprocal for W
+        float32x2_t Reciprocal = vrecpe_f32(W);
+        float32x2_t S = vrecps_f32(Reciprocal, W);
+        Reciprocal = vmul_f32(S, Reciprocal);
+        S = vrecps_f32(Reciprocal, W);
+        Reciprocal = vmul_f32(S, Reciprocal);
+
+        V = vget_low_f32(vResult);
+        V = vmul_f32(V, Reciprocal);
+#endif
+
+        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
+
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                        __m256 vTempA = _mm256_mul_ps(X1, row0);
+                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                        vTempA = _mm256_add_ps(vTempA, vTempB);
+                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                        __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
+                        vTempA = _mm256_div_ps(vTempA, W);
+
+                        W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
+                        vTempA2 = _mm256_div_ps(vTempA2, W);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44);
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                        __m256 vTempA = _mm256_mul_ps(X1, row0);
+                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                        vTempA = _mm256_add_ps(vTempA, vTempB);
+                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                        __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
+                        vTempA = _mm256_div_ps(vTempA, W);
+
+                        W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
+                        vTempA2 = _mm256_div_ps(vTempA2, W);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44);
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                    __m256 vTempA = _mm256_mul_ps(X1, row0);
+                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                    vTempA = _mm256_add_ps(vTempA, vTempB);
+                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                    __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTempA = _mm256_div_ps(vTempA, W);
+
+                    W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTempA2 = _mm256_div_ps(vTempA2, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA2)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA2, 1)));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = XM_FMADD_PS(Y, row1, row3);
+                        vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, unaligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = XM_FMADD_PS(Y, row1, row3);
+                        vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    // Result 1
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = XM_FMADD_PS(Y, row1, row3);
+                    vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    float32x4_t Result = vmulq_lane_f32(M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = _mm_mul_ps(vResult, M.r[1]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
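// Editorial note (not part of the patched source; 'world' is a hypothetical
// matrix): TransformNormal omits the translation row M.r[3], so directions are
// rotated and scaled but never displaced, unlike Transform/TransformCoord:
//
//     XMVECTOR dir = XMVector2TransformNormal(XMVectorSet(0.f, 1.f, 0.f, 0.f), world);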
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(Y, row1);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT2) * 4;
+
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx
+
+                XM_PREFETCH(pInputVector);
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+
+                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
+                pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        XMVECTOR vResult = vmulq_lane_f32(row0, V, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y
+
+        V = vget_low_f32(vResult);
+        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                    __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                    vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                    vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempB)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempB, 1)));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, unaligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    // Result 1
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                    vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = _mm_mul_ps(Y, row1);
+                    vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * 3D Vector
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+ // Comparison operations
+ //------------------------------------------------------------------------------
+
+ //------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector3Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
+#endif
+}
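// Editorial note (not part of the patched source): the ...R variants below
// return a CR6-style record mask rather than a bool; decode it with the
// DirectXMath comparison helpers instead of testing the bits directly:
//
//     uint32_t cr = XMVector3EqualR(v1, v2);          // v1, v2: XMVECTOR
//     if (XMComparisonAllTrue(cr))  { /* all three components equal */ }
//     if (XMComparisonAnyFalse(cr)) { /* at least one component differs */ }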
XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 7; + uint32_t CR = 0; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + (V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7; + uint32_t CR = 0; + if (iTemp == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#ifdef _MSC_VER + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + 
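+    // vacleq_f32 evaluates |V1 - V2| <= Epsilon in a single absolute-compare
+    // instruction; the fallback below spells out the same test. A scalar
+    // sketch of the per-component comparison, assuming a non-negative epsilon
+    // (illustrative only):
+    //
+    //   bool nearEqual = fabsf(a - b) <= eps;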
uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp) & 7) == 0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 
0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + 
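+    // How the three low movemask bits map to a CR6 record (a sketch of the
+    // logic below, not additional behavior):
+    //
+    //   iTest == 7 -> x, y and z all compare true  -> XM_CRMASK_CR6TRUE
+    //   iTest == 0 -> x, y and z all compare false -> XM_CRMASK_CR6FALSE
+    //   otherwise  -> mixed results                -> 0
+    //
+    // Callers typically test the record with XMComparisonAllTrue or
+    // XMComparisonAllFalse.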
int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x,y and z in bounds? 
(w is don't care) + return (((_mm_movemask_ps(vTemp1) & 0x7) == 0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If x or y or z are NaN, the mask is zero + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan) & 7) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
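+    // A scalar sketch of the same test (illustrative only): with the sign bit
+    // masked off, a float is infinite exactly when the remaining bits equal
+    // the IEEE-754 infinity pattern,
+    //
+    //   bool isInf = (bits & 0x7FFFFFFFU) == 0x7F800000U;
+    //
+    // so the movemask below is non-zero when x, y or z is +/-infinity.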
+ return ((_mm_movemask_ps(vTemp) & 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fValue; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0x7f); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_and_ps(vTemp, g_XMMask3); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1, V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.vector4_f32[2] + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + return XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1xy = vget_low_f32(V1); + float32x2_t v2xy = vget_low_f32(V2); + + float32x2_t v1yx = vrev64_f32(v1xy); + float32x2_t v2yx = vrev64_f32(v2xy); + + float32x2_t v1zz = vdup_lane_f32(vget_high_f32(V1), 0); + float32x2_t v2zz = vdup_lane_f32(vget_high_f32(V2), 0); + + XMVECTOR vResult = vmulq_f32(vcombine_f32(v1yx, v1xy), vcombine_f32(v2zz, v2yx)); + vResult = vmlsq_f32(vResult, vcombine_f32(v1zz, v1yx), vcombine_f32(v2yx, v2xy)); + vResult = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(vResult), g_XMFlipY)); + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vResult), g_XMMask3)); +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1, vTemp2); + // z1,x1,y1,w1 + vTemp1 = XM_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1)); + // y2,z2,x2,w2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the right operation + vResult = XM_FNMADD_PS(vTemp1, vTemp2, vResult); 
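+    // The two shuffle pairs above implement the textbook cross product; a
+    // scalar sketch of the value now held in vResult (illustrative only):
+    //
+    //   r.x = v1.y * v2.z - v1.z * v2.y;
+    //   r.y = v1.z * v2.x - v1.x * v2.z;
+    //   r.z = v1.x * v2.y - v1.y * v2.x;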
+ // Set w to zero + return _mm_and_ps(vResult, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = 
(x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? 
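+    // The shuffle/add_ss sequence is a horizontal reduction; a scalar sketch
+    // of the value being accumulated into lane 0 (illustrative only):
+    //
+    //   float lengthSq = v.x * v.x + v.y * v.y + v.z * v.z;
+    //
+    // After the final splat, _mm_sqrt_ps yields the length in every lane.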
+ vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
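+// A usage sketch contrasting the two normalize routines (illustrative only;
+// v is assumed to be any XMVECTOR carrying a 3D direction in x/y/z):
+//
+//   XMVECTOR fast    = XMVector3NormalizeEst(v);   // rsqrt estimate, roughly 12-bit on SSE
+//   XMVECTOR precise = XMVector3Normalize(v);      // full precision, zero/infinity guarded
+//
+// Prefer the estimate only where throughput matters more than accuracy.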
+ +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the normalization + vDot = _mm_mul_ps(vDot, V); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector3Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is 
infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && 
(XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + 
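+    // Snell's-law background for the formula below, as a scalar sketch
+    // (illustrative only; eta stands for the refraction index, c for dot(I, N)):
+    //
+    //   k = 1 - eta * eta * (1 - c * c);
+    //   result = (k <= 0) ? 0                               // total internal reflection
+    //                     : eta * I - (eta * c + sqrtf(k)) * N;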
+    float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN);
+    R = vmulq_f32(R, RefractionIndex);
+    R = vmlsq_f32(g_XMOne, R, RefractionIndex);
+
+    uint32x4_t isrzero = vcleq_f32(R, g_XMZero);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+
+    float32x4_t vResult;
+    if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU)
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // Sqrt(R)
+        float32x4_t S0 = vrsqrteq_f32(R);
+        float32x4_t P0 = vmulq_f32(R, S0);
+        float32x4_t R0 = vrsqrtsq_f32(P0, S0);
+        float32x4_t S1 = vmulq_f32(S0, R0);
+        float32x4_t P1 = vmulq_f32(R, S1);
+        float32x4_t R1 = vrsqrtsq_f32(P1, S1);
+        float32x4_t S2 = vmulq_f32(S1, R1);
+        R = vmulq_f32(R, S2);
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = vmlaq_f32(R, RefractionIndex, IDotN);
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = vmulq_f32(RefractionIndex, Incident);
+        vResult = vmlsq_f32(vResult, R, Normal);
+    }
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
+    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
+    XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex);
+    R = XM_FNMADD_PS(R, R2, g_XMOne);
+
+    XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero);
+    if (_mm_movemask_ps(vResult) == 0x0f)
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = _mm_sqrt_ps(R);
+        R = XM_FMADD_PS(RefractionIndex, IDotN, R);
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = _mm_mul_ps(RefractionIndex, Incident);
+        vResult = XM_FNMADD_PS(R, Normal, vResult);
+    }
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept
+{
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Z = XMVectorSplatZ(V);
+    XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
+
+    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
+
+    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
+    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
+
+    XMVECTOR S = XMVectorAdd(YZYY, Z);
+    XMVECTOR D = XMVectorSubtract(YZYY, Z);
+
+    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
+
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
+
+    return XMVectorSelect(R1, R0, Select);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+) noexcept
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACosEst(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+) noexcept
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACos(Result);
+    return Result;
+}
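+// A usage sketch for the angle helpers (illustrative only; v1 and v2 are
+// assumed to hold 3D vectors, e.g. loaded with XMLoadFloat3):
+//
+//   XMVECTOR angle   = XMVector3AngleBetweenVectors(v1, v2);
+//   float    radians = XMVectorGetX(angle);   // the result is splatted to all lanes
+//
+// The *Normals variants skip the length division and assume unit inputs.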
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) noexcept +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + 
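+    // This accumulation treats V as (x, y, z, 1); a scalar sketch of the
+    // resulting x component (illustrative only, using XMFLOAT4X4 element names):
+    //
+    //   out.x = x * m._11 + y * m._21 + z * m._31 + m._41;
+    //
+    // Unlike XMVector3TransformCoord, the result is not divided by its w.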
Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmlaq_lane_f32(M.r[3], M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = XM_FMADD_PS(vResult, M.r[2], M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 
= vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, 
row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector 
+= OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = 
vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); +#endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) 
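+    // Four or more vectors remain, so try the packed fast paths: they require
+    // tightly packed XMFLOAT3 input (InputStride == sizeof(XMFLOAT3)), and pick
+    // streaming vs. unaligned stores based on output packing and alignment.
+    // Any leftover vectors fall through to the scalar loop after this block.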
+ { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, 
_MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + 
pOutputVector += OutputStride;
+
+                    // Result 4
+                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
+                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
+                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    vTemp = XM_FMADD_PS(Z, row2, row3);
+                    vTemp2 = _mm_mul_ps(Y, row1);
+                    vTemp3 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+                    vTemp = _mm_add_ps(vTemp, vTemp3);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+        pInputVector += InputStride;
+
+        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
+        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
+        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
+        vTemp = _mm_add_ps(vTemp, vTemp2);
+        vTemp = _mm_add_ps(vTemp, vTemp3);
+
+        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+        vTemp = _mm_div_ps(vTemp, W);
+
+        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+        pOutputVector += OutputStride;
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+// Transforms a normal (direction) vector: only the upper 3x3 of M is applied,
+// so the translation in row 3 is ignored and no divide by w is performed.
+inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Z = XMVectorSplatZ(V);
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiply(Z, M.r[2]);
+    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X
+    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
+    vResult = _mm_mul_ps(vResult, M.r[2]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
+    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+#endif
+
+_Use_decl_annotations_
+inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream
+(
+    XMFLOAT3* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT3* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
+
+    assert(OutputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+
+    for (size_t i = 0; i < VectorCount; i++)
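+    // Scalar reference path: one XMFLOAT3 at a time. The NEON and SSE paths
+    // below batch four vectors per iteration when the data is tightly packed.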
+ { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax + XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx + + XM_PREFETCH(pInputVector); + + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector 
+ 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = 
_mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float 
ViewportHeight,
+    float ViewportMinZ,
+    float ViewportMaxZ,
+    FXMMATRIX Projection,
+    CXMMATRIX View,
+    CXMMATRIX World
+) noexcept
+{
+    const float HalfViewportWidth = ViewportWidth * 0.5f;
+    const float HalfViewportHeight = ViewportHeight * 0.5f;
+
+    // Map projected x/y from [-1,1] clip space into viewport pixels, and
+    // z from [0,1] into [ViewportMinZ, ViewportMaxZ]
+    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
+    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+
+    XMVECTOR Result = XMVector3TransformCoord(V, Transform);
+
+    Result = XMVectorMultiplyAdd(Result, Scale, Offset);
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+#endif
+
+_Use_decl_annotations_
+inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
+(
+    XMFLOAT3* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT3* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    float ViewportX,
+    float ViewportY,
+    float ViewportWidth,
+    float ViewportHeight,
+    float ViewportMinZ,
+    float ViewportMaxZ,
+    FXMMATRIX Projection,
+    CXMMATRIX View,
+    CXMMATRIX World
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
+
+    assert(OutputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    const float HalfViewportWidth = ViewportWidth * 0.5f;
+    const float HalfViewportHeight = ViewportHeight * 0.5f;
+
+    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
+    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+
+        XMVECTOR Result = XMVector3TransformCoord(V, Transform);
+        Result = XMVectorMultiplyAdd(Result, Scale, Offset);
+
+        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    const float HalfViewportWidth = ViewportWidth * 0.5f;
+    const float HalfViewportHeight = ViewportHeight * 0.5f;
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
+        {
+            XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth);
+            XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight);
+            XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ);
+
+            XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth);
+            XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight);
+            XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ);
+
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x3_t V =
vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + vResult0 = vdivq_f32(vResult0, W); + vResult1 = vdivq_f32(vResult1, W); + vResult2 = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult0 = vmulq_f32(vResult0, Reciprocal); + vResult1 = vmulq_f32(vResult1, Reciprocal); + vResult2 = vmulq_f32(vResult2, Reciprocal); +#endif + + V.val[0] = vmlaq_f32(OffsetX, vResult0, ScaleX); + V.val[1] = vmlaq_f32(OffsetY, vResult1, ScaleY); + V.val[2] = vmlaq_f32(OffsetZ, vResult2, ScaleZ); + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = 
vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + vResult = vmlaq_f32(Offset, vResult, Scale); + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = 
_mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + 
pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + 
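+
+// Example (illustrative sketch only, not part of the library): batch-projecting
+// world-space points to viewport space with XMVector3ProjectStream. The
+// viewport numbers and the `points`/`screen` buffers are hypothetical.
+//
+//     std::vector<XMFLOAT3> points(count);   // world-space positions
+//     std::vector<XMFLOAT3> screen(count);   // receives viewport-space output
+//     XMVector3ProjectStream(
+//         screen.data(), sizeof(XMFLOAT3),
+//         points.data(), sizeof(XMFLOAT3), count,
+//         0.0f, 0.0f, 1280.0f, 720.0f,       // viewport x, y, width, height
+//         0.0f, 1.0f,                        // min/max depth
+//         proj, view, world);
+//
+// XMVector3Unproject below is the inverse mapping: viewport space back to
+// world space through the inverse of World*View*Projection.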
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + return XMVector3TransformCoord(Result, Transform); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + float sx = 1.f / (ViewportWidth * 0.5f); + float sy = 1.f / (-ViewportHeight * 0.5f); + float sz = 1.f / (ViewportMaxZ - ViewportMinZ); + + float ox = (-ViewportX * sx) - 1.f; + float oy = (-ViewportY * sy) + 1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 
0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32(OffsetX, ScaleX, V.val[0]); + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32(OffsetY, ScaleY, V.val[1]); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, VY, r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, VY, r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, VY, r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, VY, r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32(OffsetZ, ScaleZ, V.val[2]); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, VZ, r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, VZ, r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, VZ, r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, VZ, r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); +#endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32( + static_cast(*reinterpret_cast(&sx)) + | (static_cast(*reinterpret_cast(&sy)) << 32)); + float32x2_t ScaleH = vcreate_f32(static_cast(*reinterpret_cast(&sz))); + + float32x2_t OffsetL = vcreate_f32( + static_cast(*reinterpret_cast(&ox)) + | (static_cast(*reinterpret_cast(&oy)) << 32)); + float32x2_t OffsetH = vcreate_f32(static_cast(*reinterpret_cast(&oz))); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + VL = vmla_f32(OffsetL, VL, ScaleL); + VH = vmla_f32(OffsetH, VH, ScaleH); + + XMVECTOR vResult = 
vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); 
+ vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + 
+                        // Pack and store the vectors
+                        XM3PACK4INTO3(vTemp);
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
+                        pOutputVector += sizeof(XMFLOAT3) * 4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
+                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
+                    pInputVector += sizeof(XMFLOAT3) * 4;
+
+                    // Unpack the 4 vectors (.w components are junk)
+                    XM3UNPACK3INTO4(V1, L2, L3);
+
+                    // Result 1
+                    V1 = XM_FMADD_PS(V1, Scale, Offset);
+
+                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
+                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
+                    XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
+                    XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+                    vTemp = _mm_add_ps(vTemp, vTemp3);
+
+                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    V2 = XM_FMADD_PS(V2, Scale, Offset);
+
+                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
+                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
+                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
+                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
+                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+                    vTemp = _mm_add_ps(vTemp, vTemp3);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    // Result 3
+                    V3 = XM_FMADD_PS(V3, Scale, Offset);
+
+                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
+                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
+                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
+                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
+                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+                    vTemp = _mm_add_ps(vTemp, vTemp3);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    // Result 4
+                    V4 = XM_FMADD_PS(V4, Scale, Offset);
+
+                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
+                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
+                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
+                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
+                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+                    vTemp = _mm_add_ps(vTemp, vTemp3);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+        pInputVector += InputStride;
+
+        V = _mm_mul_ps(V, Scale);
+        V = _mm_add_ps(V, Offset);
+
+        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
+        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
+        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
+        vTemp = _mm_add_ps(vTemp, vTemp2);
+        vTemp = _mm_add_ps(vTemp, vTemp3);
+
+        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+        vTemp = _mm_div_ps(vTemp, W);
+
+        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
+        pOutputVector += OutputStride;
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+/****************************************************************************
+ *
+ * 4D Vector
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+ // Comparison operations
+ //------------------------------------------------------------------------------
+
+ //------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4EqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t CR = 0;
+
+    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
+        (V1.vector4_f32[3] == V2.vector4_f32[3]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
+        (V1.vector4_f32[3] != V2.vector4_f32[3]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFFFU)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!r)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    int iTest = _mm_movemask_ps(vTemp);
+    uint32_t CR = 0;
+    if (iTest == 0xf)   // All equal?
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (iTest == 0)  // All not equal?
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) == 0xf) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_u32[0] == V2.vector4_u32[0] && + V1.vector4_u32[1] == V2.vector4_u32[1] && + V1.vector4_u32[2] == V2.vector4_u32[2] && + V1.vector4_u32[3] == V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_u32[0] != V2.vector4_u32[0] && + V1.vector4_u32[1] != V2.vector4_u32[1] && + V1.vector4_u32[2] != V2.vector4_u32[2] && + V1.vector4_u32[3] != V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); + uint32_t CR = 0; + if (iTest == 0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest == 0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +inline bool XM_CALLCONV XMVector4NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz, dw; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + dw = fabsf(V1.vector4_f32[3] - V2.vector4_f32[3]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2]) && + (dw <= Epsilon.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#ifdef _MSC_VER + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + return ((_mm_movemask_ps(vTemp) == 0xf) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpneq_ps(V1, V2); + return ((_mm_movemask_ps(vTemp)) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) != 0xF) != 0); +#else + return 
XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + 
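+//------------------------------------------------------------------------------
+// Illustrative caller-side sketch (not part of the library): the R-suffixed
+// comparisons above return a CR6-style control record, so a single compare
+// can feed several predicates through the XMComparison* helpers instead of
+// re-running the comparison. The function name below is hypothetical.
+
+inline uint32_t XM_CALLCONV XMVector4ClassifyEqualExample
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    uint32_t CR = XMVector4EqualR(V1, V2);
+    if (XMComparisonAllTrue(CR))
+    {
+        return 2;   // every component of V1 equals the matching component of V2
+    }
+    if (XMComparisonAllFalse(CR))
+    {
+        return 0;   // every component differs
+    }
+    return 1;       // mixed: some components equal, some not
+}
+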
+//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ 
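+// A hedged usage sketch for the epsilon comparison defined earlier: floating-
+// point results accumulated through different instruction paths (FMA vs.
+// mul/add, SSE vs. NEON) rarely match bit-for-bit, so callers typically use
+// XMVector4NearEqual with a caller-chosen tolerance. The function name and
+// tolerance below are hypothetical, not part of the library.
+
+inline bool XM_CALLCONV XMVector4RoughlyEqualExample
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+    // Replicate a scalar tolerance into all four lanes
+    XMVECTOR Epsilon = XMVectorReplicate(1.0e-4f);
+    return XMVector4NearEqual(V1, V2, Epsilon);
+}
+
+//------------------------------------------------------------------------------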
+ +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // All in bounds? + return ((_mm_movemask_ps(vTemp1) == 0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If any are NaN, the mask is zero + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0xff); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1, vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp, _MM_SHUFFLE(1, 0, 0, 0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2, vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // 
((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2])) * V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1])) * V1.vector4_f32[3]), + (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3])) * V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3])) * V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[3]), + (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0])) * V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0])) * V1.vector4_f32[3]), + (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2])) * V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1])) * V1.vector4_f32[2]), + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint32x2_t select = vget_low_u32(g_XMMaskX); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32(select, v2yx, v2wz); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32(select, v3wz, v3xy); + + float32x4_t vTemp1 = vcombine_f32(v2zw, v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz, v3wy); + XMVECTOR vResult = vmulq_f32(vTemp1, vTemp2); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32(select, v2wz, v2xy); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32(select, v3yx, v3wz); + + vTemp1 = vcombine_f32(v2wz, v2wy); + vTemp2 = vcombine_f32(v3zw, v3yz); + vResult = vmlsq_f32(vResult, vTemp1, vTemp2); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32(v1yx, vdup_lane_f32(v1yx, 1)); + vResult = vmulq_f32(vResult, vTemp1); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32(select, v2xy, v2wz); + + const float32x2_t v3wx = vbsl_f32(select, v3wz, v3yx); + + vTemp1 = vcombine_f32(v2yw, v2xz); + vTemp2 = vcombine_f32(v3wx, v3wx); + float32x4_t vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32(select, v2wz, v2yx); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32(select, v3xy, v3wz); + + vTemp1 = vcombine_f32(v2wx, v2wx); + vTemp2 = vcombine_f32(v3yw, v3xz); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1zw, 0), 
vdup_lane_f32(v1yx, 0)); + vResult = vmlsq_f32(vResult, vTerm, vTemp1); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz, v2xy); + vTemp2 = vcombine_f32(v3zx, v3yx); + vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = vcombine_f32(v2zx, v2yx); + vTemp2 = vcombine_f32(v3yz, v3xy); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1wz, 0), v1wz); + return vmlaq_f32(vResult, vTerm, vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 1, 3, 2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 3, 2, 3)); + vResult = _mm_mul_ps(vResult, vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 3, 2, 3)); + vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(1, 3, 0, 1)); + vResult = XM_FNMADD_PS(vTemp2, vTemp3, vResult); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 1)); + vResult = _mm_mul_ps(vResult, vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 3, 0, 3)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 1, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_FNMADD_PS(vTemp2, vTemp1, vTemp3); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 2, 2)); + vResult = XM_FNMADD_PS(vTemp1, vTemp3, vResult); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 1, 0, 2)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 0, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_FNMADD_PS(vTemp1, vTemp2, vTemp3); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 3, 3, 3)); + vResult = XM_FMADD_PS(vTemp3, vTemp1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, 
_MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! 
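+    // (the true sqrt above plus this divide give full precision, unlike the
+    // ~12-bit _mm_rsqrt_ps estimate used by XMVector4ReciprocalLengthEst)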
+ vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + 
vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0xff); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result 
based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + 
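+    // Scale the unit-length direction back up by the clamped length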
XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex); + + uint32x4_t isrzero = vcleq_f32(R, g_XMZero); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + + float32x4_t vResult; + if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32(R, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(R, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + float32x4_t S2 = vmulq_f32(S1, R1); + R = vmulq_f32(R, S2); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32(R, RefractionIndex, IDotN); + // Result = RefractionIndex * Incident - Normal * R + vResult = 
vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32(vResult, R, Normal); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex); + R = XM_FNMADD_PS(R, R2, g_XMOne); + + XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero); + if (_mm_movemask_ps(vResult) == 0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + R = XM_FMADD_PS(RefractionIndex, IDotN, R); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(R, Normal, vResult); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V.vector4_f32[2], + V.vector4_f32[3], + -V.vector4_f32[0], + -V.vector4_f32[1] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { 1.f, 1.f, -1.f, -1.f } } }; + + float32x4_t Result = vcombine_f32(vget_high_f32(V), vget_low_f32(V)); + return vmulq_f32(Result, Negate); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 0, 3, 2)); + vResult = _mm_mul_ps(vResult, FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0] * V.vector4_f32[0]) + (M.m[1][0] * V.vector4_f32[1]) + (M.m[2][0] * V.vector4_f32[2]) + (M.m[3][0] * V.vector4_f32[3]); + float fY = (M.m[0][1] * V.vector4_f32[0]) + (M.m[1][1] * V.vector4_f32[1]) + (M.m[2][1] * V.vector4_f32[2]) + (M.m[3][1] * V.vector4_f32[3]); + float fZ = (M.m[0][2] * V.vector4_f32[0]) + (M.m[1][2] * V.vector4_f32[1]) + (M.m[2][2] * V.vector4_f32[2]) + (M.m[3][2] * V.vector4_f32[3]); + 
float fW = (M.m[0][3] * V.vector4_f32[0]) + (M.m[1][3] * V.vector4_f32[1]) + (M.m[2][3] * V.vector4_f32[2]) + (M.m[3][3] * V.vector4_f32[3]); + XMVECTORF32 vResult = { { { fX, fY, fZ, fW } } }; + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + float32x2_t VH = vget_high_f32(V); + vResult = vmlaq_lane_f32(vResult, M.r[2], VH, 0); // Z + return vmlaq_lane_f32(vResult, M.r[3], VH, 1); // W +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // W + vResult = _mm_mul_ps(vResult, M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = XM_FMADD_PS(vTemp, M.r[2], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT4)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat4(reinterpret_cast(pInputVector)); + XMVECTOR W = XMVectorSplatW(V); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(W, row3); + Result = XMVectorMultiplyAdd(Z, row2, Result); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) +#endif + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x4_t V = vld4q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT4) * 4; + + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax + XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx + + XM_PREFETCH(pInputVector); + + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx + 
XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1); // Dx + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz + vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + r = vget_low_f32(row3); + vResult0 = vmlaq_lane_f32(vResult0, V.val[3], r, 0); // Ax+Ey+Iz+Mw + vResult1 = vmlaq_lane_f32(vResult1, V.val[3], r, 1); // Bx+Fy+Jz+Nw + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 6)); + + r = vget_high_f32(row3); + vResult2 = vmlaq_lane_f32(vResult2, V.val[3], r, 0); // Cx+Gy+Kz+Ow + vResult3 = vmlaq_lane_f32(vResult3, V.val[3], r, 1); // Dx+Hy+Lz+Pw + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 7)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + V.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = vld1q_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + float32x2_t VH = vget_high_f32(V); + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + vResult = vmlaq_lane_f32(vResult, row3, VH, 1); // W + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row2 = _mm256_broadcast_ps(&M.r[2]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT4)) + { + if (OutputStride == sizeof(XMFLOAT4)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT4) * 2; + + __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + + vTempX = _mm256_mul_ps(vTempX, row0); + vTempY = _mm256_mul_ps(vTempY, row1); + vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX); + vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY); + vTempX = _mm256_add_ps(vTempZ, vTempW); + + XM256_STREAM_PS(reinterpret_cast(pOutputVector), vTempX); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 2; + } + } + else + { + 
+                    // Packed input, packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT4) * 2;
+
+                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        vTempX = _mm256_mul_ps(vTempX, row0);
+                        vTempY = _mm256_mul_ps(vTempY, row1);
+                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                        vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT4) * 2;
+
+                    __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                    __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTempX = _mm256_mul_ps(vTempX, row0);
+                    vTempY = _mm256_mul_ps(vTempY, row1);
+                    vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                    vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                    vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempX));
+                    pOutputVector += OutputStride;
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempX, 1));
+                    pOutputVector += OutputStride;
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row2 = M.r[2];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+            pInputVector += InputStride;
+
+            XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+            XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+            XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTempX = _mm_mul_ps(vTempX, row0);
+            vTempY = _mm_mul_ps(vTempY, row1);
+            vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+            vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+            vTempX = _mm_add_ps(vTempZ, vTempW);
+
+            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
+    {
+        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
+        {
+            // Aligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+    else
+    {
+        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
+        {
+            // Aligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+#ifndef _XM_NO_XMVECTOR_OVERLOADS_
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept
+{
+    return V;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept
+{
+    return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator+=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+) noexcept
+{
+    V1 = XMVectorAdd(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV
operator-= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorDivide(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) noexcept +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorDivide(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S +) noexcept +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S +) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V +) noexcept +{ + return XMVectorScale(V, S); +} + +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif + diff --git a/DirectXMath/DirectXPackedVector.h b/DirectXMath/DirectXPackedVector.h new file mode 100644 index 0000000..f76a05e --- /dev/null +++ b/DirectXMath/DirectXPackedVector.h @@ -0,0 +1,1216 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + namespace PackedVector + { + +#pragma warning(push) +#pragma warning(disable:4201 4365 4324 4996) + // C4201: nonstandard extension used + // C4365: Off by default noise + // C4324: alignment padding warnings + // C4996: deprecation warnings + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#endif + + //------------------------------------------------------------------------------ + // ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into + // a 32 bit integer. The normalized color is packed into 32 bits using 8 bit + // unsigned, normalized integers for the alpha, red, green, and blue components. + // The alpha component is stored in the most significant bits and the blue + // component in the least significant bits (A8R8G8B8): + // [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] + struct XMCOLOR + { + union + { + struct + { + uint8_t b; // Blue: 0/255 to 255/255 + uint8_t g; // Green: 0/255 to 255/255 + uint8_t r; // Red: 0/255 to 255/255 + uint8_t a; // Alpha: 0/255 to 255/255 + }; + uint32_t c; + }; + + XMCOLOR() = default; + + XMCOLOR(const XMCOLOR&) = default; + XMCOLOR& operator=(const XMCOLOR&) = default; + + XMCOLOR(XMCOLOR&&) = default; + XMCOLOR& operator=(XMCOLOR&&) = default; + + constexpr XMCOLOR(uint32_t Color) noexcept : c(Color) {} + XMCOLOR(float _r, float _g, float _b, float _a) noexcept; + explicit XMCOLOR(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return c; } + + XMCOLOR& operator= (const uint32_t Color) noexcept { c = Color; return *this; } + }; + + //------------------------------------------------------------------------------ + // 16 bit floating point number consisting of a sign bit, a 5 bit biased + // exponent, and a 10 bit mantissa + using HALF = uint16_t; + + //------------------------------------------------------------------------------ + // 2D Vector; 16 bit floating point components + struct XMHALF2 + { + union + { + struct + { + HALF x; + HALF y; + }; + uint32_t v; + }; + + XMHALF2() = default; + + XMHALF2(const XMHALF2&) = default; + XMHALF2& operator=(const XMHALF2&) = default; + + XMHALF2(XMHALF2&&) = default; + XMHALF2& operator=(XMHALF2&&) = default; + + explicit constexpr XMHALF2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMHALF2(HALF _x, HALF _y) noexcept : x(_x), y(_y) {} + explicit XMHALF2(_In_reads_(2) const HALF* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMHALF2(float _x, float _y) noexcept; + explicit XMHALF2(_In_reads_(2) const float* pArray) noexcept; + + XMHALF2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 16 bit signed normalized integer components + struct XMSHORTN2 + { + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORTN2() = default; + + XMSHORTN2(const XMSHORTN2&) = default; + XMSHORTN2& operator=(const XMSHORTN2&) = default; + + XMSHORTN2(XMSHORTN2&&) = default; + XMSHORTN2& operator=(XMSHORTN2&&) = default; + + explicit constexpr XMSHORTN2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMSHORTN2(int16_t _x, int16_t _y) noexcept : x(_x), y(_y) {} + explicit 
XMSHORTN2(_In_reads_(2) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMSHORTN2(float _x, float _y) noexcept; + explicit XMSHORTN2(_In_reads_(2) const float* pArray) noexcept; + + XMSHORTN2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit signed integer components + struct XMSHORT2 + { + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORT2() = default; + + XMSHORT2(const XMSHORT2&) = default; + XMSHORT2& operator=(const XMSHORT2&) = default; + + XMSHORT2(XMSHORT2&&) = default; + XMSHORT2& operator=(XMSHORT2&&) = default; + + explicit constexpr XMSHORT2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMSHORT2(int16_t _x, int16_t _y) noexcept : x(_x), y(_y) {} + explicit XMSHORT2(_In_reads_(2) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMSHORT2(float _x, float _y) noexcept; + explicit XMSHORT2(_In_reads_(2) const float* pArray) noexcept; + + XMSHORT2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit unsigned normalized integer components + struct XMUSHORTN2 + { + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORTN2() = default; + + XMUSHORTN2(const XMUSHORTN2&) = default; + XMUSHORTN2& operator=(const XMUSHORTN2&) = default; + + XMUSHORTN2(XMUSHORTN2&&) = default; + XMUSHORTN2& operator=(XMUSHORTN2&&) = default; + + explicit constexpr XMUSHORTN2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORTN2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {} + explicit XMUSHORTN2(_In_reads_(2) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUSHORTN2(float _x, float _y) noexcept; + explicit XMUSHORTN2(_In_reads_(2) const float* pArray) noexcept; + + XMUSHORTN2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit unsigned integer components + struct XMUSHORT2 + { + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORT2() = default; + + XMUSHORT2(const XMUSHORT2&) = default; + XMUSHORT2& operator=(const XMUSHORT2&) = default; + + XMUSHORT2(XMUSHORT2&&) = default; + XMUSHORT2& operator=(XMUSHORT2&&) = default; + + explicit constexpr XMUSHORT2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORT2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {} + explicit XMUSHORT2(_In_reads_(2) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUSHORT2(float _x, float _y) noexcept; + explicit XMUSHORT2(_In_reads_(2) const float* pArray) noexcept; + + XMUSHORT2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 8 bit signed normalized integer components + struct XMBYTEN2 + { + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTEN2() = default; + + XMBYTEN2(const XMBYTEN2&) = default; + XMBYTEN2& operator=(const XMBYTEN2&) = default; + + XMBYTEN2(XMBYTEN2&&) = default; + XMBYTEN2& operator=(XMBYTEN2&&) = default; + + explicit constexpr XMBYTEN2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMBYTEN2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {} + explicit XMBYTEN2(_In_reads_(2) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMBYTEN2(float _x, float _y) noexcept; + explicit XMBYTEN2(_In_reads_(2) const float* pArray) noexcept; + + XMBYTEN2& operator= (uint16_t Packed) noexcept { v = Packed; return 
*this; } + }; + + // 2D Vector; 8 bit signed integer components + struct XMBYTE2 + { + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTE2() = default; + + XMBYTE2(const XMBYTE2&) = default; + XMBYTE2& operator=(const XMBYTE2&) = default; + + XMBYTE2(XMBYTE2&&) = default; + XMBYTE2& operator=(XMBYTE2&&) = default; + + explicit constexpr XMBYTE2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMBYTE2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {} + explicit XMBYTE2(_In_reads_(2) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMBYTE2(float _x, float _y) noexcept; + explicit XMBYTE2(_In_reads_(2) const float* pArray) noexcept; + + XMBYTE2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 8 bit unsigned normalized integer components + struct XMUBYTEN2 + { + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTEN2() = default; + + XMUBYTEN2(const XMUBYTEN2&) = default; + XMUBYTEN2& operator=(const XMUBYTEN2&) = default; + + XMUBYTEN2(XMUBYTEN2&&) = default; + XMUBYTEN2& operator=(XMUBYTEN2&&) = default; + + explicit constexpr XMUBYTEN2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUBYTEN2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {} + explicit XMUBYTEN2(_In_reads_(2) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUBYTEN2(float _x, float _y) noexcept; + explicit XMUBYTEN2(_In_reads_(2) const float* pArray) noexcept; + + XMUBYTEN2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 8 bit unsigned integer components + struct XMUBYTE2 + { + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTE2() = default; + + XMUBYTE2(const XMUBYTE2&) = default; + XMUBYTE2& operator=(const XMUBYTE2&) = default; + + XMUBYTE2(XMUBYTE2&&) = default; + XMUBYTE2& operator=(XMUBYTE2&&) = default; + + explicit constexpr XMUBYTE2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUBYTE2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {} + explicit XMUBYTE2(_In_reads_(2) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUBYTE2(float _x, float _y) noexcept; + explicit XMUBYTE2(_In_reads_(2) const float* pArray) noexcept; + + XMUBYTE2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 5/6/5 unsigned integer components + struct XMU565 + { + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 6; // 0 to 63 + uint16_t z : 5; // 0 to 31 + }; + uint16_t v; + }; + + XMU565() = default; + + XMU565(const XMU565&) = default; + XMU565& operator=(const XMU565&) = default; + + XMU565(XMU565&&) = default; + XMU565& operator=(XMU565&&) = default; + + explicit constexpr XMU565(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMU565(uint8_t _x, uint8_t _y, uint8_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMU565(_In_reads_(3) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + XMU565(float _x, float _y, float _z) noexcept; + explicit XMU565(_In_reads_(3) const float* pArray) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMU565& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 11/11/10 floating-point components + // The 3D vector is packed into 32 bits as follows: a 5-bit 
biased exponent + // and 6-bit mantissa for x component, a 5-bit biased exponent and + // 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit + // mantissa for z. The z component is stored in the most significant bits + // and the x component in the least significant bits. No sign bits so + // all partial-precision numbers are positive. + // (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] + struct XMFLOAT3PK + { + union + { + struct + { + uint32_t xm : 6; // x-mantissa + uint32_t xe : 5; // x-exponent + uint32_t ym : 6; // y-mantissa + uint32_t ye : 5; // y-exponent + uint32_t zm : 5; // z-mantissa + uint32_t ze : 5; // z-exponent + }; + uint32_t v; + }; + + XMFLOAT3PK() = default; + + XMFLOAT3PK(const XMFLOAT3PK&) = default; + XMFLOAT3PK& operator=(const XMFLOAT3PK&) = default; + + XMFLOAT3PK(XMFLOAT3PK&&) = default; + XMFLOAT3PK& operator=(XMFLOAT3PK&&) = default; + + explicit constexpr XMFLOAT3PK(uint32_t Packed) noexcept : v(Packed) {} + XMFLOAT3PK(float _x, float _y, float _z) noexcept; + explicit XMFLOAT3PK(_In_reads_(3) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMFLOAT3PK& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 9/9/9 floating-point components with shared 5-bit exponent + // The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent + // with 9-bit mantissa for the x, y, and z component. The shared exponent + // is stored in the most significant bits and the x component mantissa is in + // the least significant bits. No sign bits so all partial-precision numbers + // are positive. + // (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] + struct XMFLOAT3SE + { + union + { + struct + { + uint32_t xm : 9; // x-mantissa + uint32_t ym : 9; // y-mantissa + uint32_t zm : 9; // z-mantissa + uint32_t e : 5; // shared exponent + }; + uint32_t v; + }; + + XMFLOAT3SE() = default; + + XMFLOAT3SE(const XMFLOAT3SE&) = default; + XMFLOAT3SE& operator=(const XMFLOAT3SE&) = default; + + XMFLOAT3SE(XMFLOAT3SE&&) = default; + XMFLOAT3SE& operator=(XMFLOAT3SE&&) = default; + + explicit constexpr XMFLOAT3SE(uint32_t Packed) noexcept : v(Packed) {} + XMFLOAT3SE(float _x, float _y, float _z) noexcept; + explicit XMFLOAT3SE(_In_reads_(3) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMFLOAT3SE& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 16 bit floating point components + struct XMHALF4 + { + union + { + struct + { + HALF x; + HALF y; + HALF z; + HALF w; + }; + uint64_t v; + }; + + XMHALF4() = default; + + XMHALF4(const XMHALF4&) = default; + XMHALF4& operator=(const XMHALF4&) = default; + + XMHALF4(XMHALF4&&) = default; + XMHALF4& operator=(XMHALF4&&) = default; + + explicit constexpr XMHALF4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMHALF4(_In_reads_(4) const HALF* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMHALF4(float _x, float _y, float _z, float _w) noexcept; + explicit XMHALF4(_In_reads_(4) const float* pArray) noexcept; + + XMHALF4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + 
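+        // Usage sketch (illustrative only, not part of the upstream header):
+        // XMHALF4 is normally round-tripped through the load/store helpers
+        // declared later in this file, e.g.
+        //
+        //     using namespace DirectX;
+        //     using namespace DirectX::PackedVector;
+        //
+        //     XMHALF4 h(1.0f, 0.5f, -2.0f, 0.0f);  // each float converted to a 16-bit half
+        //     XMVECTOR v = XMLoadHalf4(&h);        // expand to four 32-bit float lanes
+        //     XMStoreHalf4(&h, v);                 // compress back to halves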
//------------------------------------------------------------------------------ + // 4D Vector; 16 bit signed normalized integer components + struct XMSHORTN4 + { + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORTN4() = default; + + XMSHORTN4(const XMSHORTN4&) = default; + XMSHORTN4& operator=(const XMSHORTN4&) = default; + + XMSHORTN4(XMSHORTN4&&) = default; + XMSHORTN4& operator=(XMSHORTN4&&) = default; + + explicit constexpr XMSHORTN4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORTN4(_In_reads_(4) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORTN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMSHORTN4(_In_reads_(4) const float* pArray) noexcept; + + XMSHORTN4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit signed integer components + struct XMSHORT4 + { + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORT4() = default; + + XMSHORT4(const XMSHORT4&) = default; + XMSHORT4& operator=(const XMSHORT4&) = default; + + XMSHORT4(XMSHORT4&&) = default; + XMSHORT4& operator=(XMSHORT4&&) = default; + + explicit constexpr XMSHORT4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORT4(_In_reads_(4) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORT4(float _x, float _y, float _z, float _w) noexcept; + explicit XMSHORT4(_In_reads_(4) const float* pArray) noexcept; + + XMSHORT4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit unsigned normalized integer components + struct XMUSHORTN4 + { + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORTN4() = default; + + XMUSHORTN4(const XMUSHORTN4&) = default; + XMUSHORTN4& operator=(const XMUSHORTN4&) = default; + + XMUSHORTN4(XMUSHORTN4&&) = default; + XMUSHORTN4& operator=(XMUSHORTN4&&) = default; + + explicit constexpr XMUSHORTN4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORTN4(_In_reads_(4) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORTN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUSHORTN4(_In_reads_(4) const float* pArray) noexcept; + + XMUSHORTN4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit unsigned integer components + struct XMUSHORT4 + { + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORT4() = default; + + XMUSHORT4(const XMUSHORT4&) = default; + XMUSHORT4& operator=(const XMUSHORT4&) = default; + + XMUSHORT4(XMUSHORT4&&) = default; + XMUSHORT4& operator=(XMUSHORT4&&) = default; + + explicit constexpr XMUSHORT4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORT4(_In_reads_(4) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + 
XMUSHORT4(float _x, float _y, float _z, float _w) noexcept;
+        explicit XMUSHORT4(_In_reads_(4) const float* pArray) noexcept;
+
+        XMUSHORT4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; }
+    };
+
+    //------------------------------------------------------------------------------
+    // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+    // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
+    // normalized integer for the w component and 10 bit signed, normalized
+    // integers for the z, y, and x components. The w component is stored in the
+    // most significant bits and the x component in the least significant bits
+    // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+    struct XMXDECN4
+    {
+        union
+        {
+            struct
+            {
+                int32_t x : 10;  // -511/511 to 511/511
+                int32_t y : 10;  // -511/511 to 511/511
+                int32_t z : 10;  // -511/511 to 511/511
+                uint32_t w : 2;  // 0/3 to 3/3
+            };
+            uint32_t v;
+        };
+
+        XMXDECN4() = default;
+
+        XMXDECN4(const XMXDECN4&) = default;
+        XMXDECN4& operator=(const XMXDECN4&) = default;
+
+        XMXDECN4(XMXDECN4&&) = default;
+        XMXDECN4& operator=(XMXDECN4&&) = default;
+
+        explicit constexpr XMXDECN4(uint32_t Packed) noexcept : v(Packed) {}
+        XMXDECN4(float _x, float _y, float _z, float _w) noexcept;
+        explicit XMXDECN4(_In_reads_(4) const float* pArray) noexcept;
+
+        operator uint32_t () const noexcept { return v; }
+
+        XMXDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; }
+    };
+
+    // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
+    // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned
+    // integer for the w component and 10 bit signed integers for the
+    // z, y, and x components. The w component is stored in the
+    // most significant bits and the x component in the least significant bits
+    // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
+    struct XM_DEPRECATED XMXDEC4
+    {
+        union
+        {
+            struct
+            {
+                int32_t x : 10;  // -511 to 511
+                int32_t y : 10;  // -511 to 511
+                int32_t z : 10;  // -511 to 511
+                uint32_t w : 2;  // 0 to 3
+            };
+            uint32_t v;
+        };
+
+        XMXDEC4() = default;
+
+        XMXDEC4(const XMXDEC4&) = default;
+        XMXDEC4& operator=(const XMXDEC4&) = default;
+
+        XMXDEC4(XMXDEC4&&) = default;
+        XMXDEC4& operator=(XMXDEC4&&) = default;
+
+        explicit constexpr XMXDEC4(uint32_t Packed) noexcept : v(Packed) {}
+        XMXDEC4(float _x, float _y, float _z, float _w) noexcept;
+        explicit XMXDEC4(_In_reads_(4) const float* pArray) noexcept;
+
+        operator uint32_t () const noexcept { return v; }
+
+        XMXDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; }
+    };
+
+    // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
+    // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed,
+    // normalized integer for the w component and 10 bit signed, normalized
+    // integers for the z, y, and x components.
The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMDECN4 + { + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + int32_t w : 2; // -1/1 to 1/1 + }; + uint32_t v; + }; + + XMDECN4() = default; + + XMDECN4(const XMDECN4&) = default; + XMDECN4& operator=(const XMDECN4&) = default; + + XMDECN4(XMDECN4&&) = default; + XMDECN4& operator=(XMDECN4&&) = default; + + explicit constexpr XMDECN4(uint32_t Packed) noexcept : v(Packed) {} + XMDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The 4D Vector is packed into 32 bits as follows: a 2 bit signed, + // integer for the w component and 10 bit signed integers for the + // z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMDEC4 + { + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + int32_t w : 2; // -1 to 1 + }; + uint32_t v; + }; + + XMDEC4() = default; + + XMDEC4(const XMDEC4&) = default; + XMDEC4& operator=(const XMDEC4&) = default; + + XMDEC4(XMDEC4&&) = default; + XMDEC4& operator=(XMDEC4&&) = default; + + explicit constexpr XMDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // normalized integer for the w component and 10 bit unsigned, normalized + // integers for the z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMUDECN4 + { + union + { + struct + { + uint32_t x : 10; // 0/1023 to 1023/1023 + uint32_t y : 10; // 0/1023 to 1023/1023 + uint32_t z : 10; // 0/1023 to 1023/1023 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMUDECN4() = default; + + XMUDECN4(const XMUDECN4&) = default; + XMUDECN4& operator=(const XMUDECN4&) = default; + + XMUDECN4(XMUDECN4&&) = default; + XMUDECN4& operator=(XMUDECN4&&) = default; + + explicit constexpr XMUDECN4(uint32_t Packed) noexcept : v(Packed) {} + XMUDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMUDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // integer for the w component and 10 bit unsigned integers + // for the z, y, and x components. 
The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMUDEC4 + { + union + { + struct + { + uint32_t x : 10; // 0 to 1023 + uint32_t y : 10; // 0 to 1023 + uint32_t z : 10; // 0 to 1023 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMUDEC4() = default; + + XMUDEC4(const XMUDEC4&) = default; + XMUDEC4& operator=(const XMUDEC4&) = default; + + XMUDEC4(XMUDEC4&&) = default; + XMUDEC4& operator=(XMUDEC4&&) = default; + + explicit constexpr XMUDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMUDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMUDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 8 bit signed normalized integer components + struct XMBYTEN4 + { + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTEN4() = default; + + XMBYTEN4(const XMBYTEN4&) = default; + XMBYTEN4& operator=(const XMBYTEN4&) = default; + + XMBYTEN4(XMBYTEN4&&) = default; + XMBYTEN4& operator=(XMBYTEN4&&) = default; + + constexpr XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMBYTEN4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMBYTEN4(_In_reads_(4) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTEN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMBYTEN4(_In_reads_(4) const float* pArray) noexcept; + + XMBYTEN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit signed integer components + struct XMBYTE4 + { + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTE4() = default; + + XMBYTE4(const XMBYTE4&) = default; + XMBYTE4& operator=(const XMBYTE4&) = default; + + XMBYTE4(XMBYTE4&&) = default; + XMBYTE4& operator=(XMBYTE4&&) = default; + + constexpr XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMBYTE4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMBYTE4(_In_reads_(4) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMBYTE4(_In_reads_(4) const float* pArray) noexcept; + + XMBYTE4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit unsigned normalized integer components + struct XMUBYTEN4 + { + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTEN4() = default; + + XMUBYTEN4(const XMUBYTEN4&) = default; + XMUBYTEN4& operator=(const XMUBYTEN4&) = default; + + XMUBYTEN4(XMUBYTEN4&&) = default; + XMUBYTEN4& operator=(XMUBYTEN4&&) = default; + + constexpr XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMUBYTEN4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMUBYTEN4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTEN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUBYTEN4(_In_reads_(4) const float* 
pArray) noexcept; + + XMUBYTEN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit unsigned integer components + struct XMUBYTE4 + { + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTE4() = default; + + XMUBYTE4(const XMUBYTE4&) = default; + XMUBYTE4& operator=(const XMUBYTE4&) = default; + + XMUBYTE4(XMUBYTE4&&) = default; + XMUBYTE4& operator=(XMUBYTE4&&) = default; + + constexpr XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMUBYTE4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMUBYTE4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUBYTE4(_In_reads_(4) const float* pArray) noexcept; + + XMUBYTE4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D vector; 4 bit unsigned integer components + struct XMUNIBBLE4 + { + union + { + struct + { + uint16_t x : 4; // 0 to 15 + uint16_t y : 4; // 0 to 15 + uint16_t z : 4; // 0 to 15 + uint16_t w : 4; // 0 to 15 + }; + uint16_t v; + }; + + XMUNIBBLE4() = default; + + XMUNIBBLE4(const XMUNIBBLE4&) = default; + XMUNIBBLE4& operator=(const XMUNIBBLE4&) = default; + + XMUNIBBLE4(XMUNIBBLE4&&) = default; + XMUNIBBLE4& operator=(XMUNIBBLE4&&) = default; + + explicit constexpr XMUNIBBLE4(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUNIBBLE4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUNIBBLE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUNIBBLE4(_In_reads_(4) const float* pArray) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMUNIBBLE4& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D vector: 5/5/5/1 unsigned integer components + struct XMU555 + { + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 5; // 0 to 31 + uint16_t z : 5; // 0 to 31 + uint16_t w : 1; // 0 or 1 + }; + uint16_t v; + }; + + XMU555() = default; + + XMU555(const XMU555&) = default; + XMU555& operator=(const XMU555&) = default; + + XMU555(XMU555&&) = default; + XMU555& operator=(XMU555&&) = default; + + explicit constexpr XMU555(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) noexcept : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} + XMU555(_In_reads_(3) const uint8_t* pArray, _In_ bool _w) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} + XMU555(float _x, float _y, float _z, bool _w) noexcept; + XMU555(_In_reads_(3) const float* pArray, _In_ bool _w) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMU555& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +#pragma warning(pop) + + + /**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + float XMConvertHalfToFloat(HALF Value) noexcept; + float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(HALF) + InputStride * (HalfCount - 1)) const HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount) noexcept; + HALF XMConvertFloatToHalf(float Value) noexcept; + HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF) + OutputStride * (FloatCount - 1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource) noexcept; + XMVECTOR 
XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource) noexcept; + +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation warning + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource) noexcept; + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource) noexcept; + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ const XMXDEC4* pSource) noexcept; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#pragma warning(pop) + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V) noexcept; + +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation warning + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic 
ignored "-Wdeprecated-declarations" +#endif + + void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#pragma warning(pop) + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) + // C4068/4616: ignore unknown pragmas + // C4214/4204: nonstandard extension used + // C4365: Off by default noise + // C6001/6101: False positives + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#include "DirectXPackedVector.inl" + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +#pragma warning(pop) + + } // namespace PackedVector + +} // namespace DirectX + diff --git a/DirectXMath/DirectXPackedVector.inl b/DirectXMath/DirectXPackedVector.inl new file mode 100644 index 0000000..fa1a660 --- /dev/null +++ b/DirectXMath/DirectXPackedVector.inl @@ -0,0 +1,4438 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline float XMConvertHalfToFloat(HALF Value) noexcept +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtsi32_si128(static_cast(Value)); + __m128 V2 = _mm_cvtph_ps(V1); + return _mm_cvtss_f32(V2); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + uint16x4_t vHalf = vdup_n_u16(Value); + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + return vgetq_lane_f32(vFloat, 0); +#else + auto Mantissa = static_cast(Value & 0x03FF); + + uint32_t Exponent = (Value & 0x7C00); + if (Exponent == 0x7C00) // INF/NAN + { + Exponent = 0x8f; + } + else if (Exponent != 0) // The value is normalized + { + Exponent = static_cast((static_cast(Value) >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + uint32_t Result = + ((static_cast(Value) & 0x8000) << 16) // Sign + | ((Exponent + 112) << 23) // Exponent + | (Mantissa << 13); // Mantissa + + return reinterpret_cast(&Result)[0]; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ 
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+#endif
+
+_Use_decl_annotations_
+inline float* XMConvertHalfToFloatStream
+(
+    float* pOutputStream,
+    size_t OutputStride,
+    const HALF* pInputStream,
+    size_t InputStride,
+    size_t HalfCount
+) noexcept
+{
+    assert(pOutputStream);
+    assert(pInputStream);
+
+    assert(InputStride >= sizeof(HALF));
+    _Analysis_assume_(InputStride >= sizeof(HALF));
+
+    assert(OutputStride >= sizeof(float));
+    _Analysis_assume_(OutputStride >= sizeof(float));
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = HalfCount >> 2;
+    if (four > 0)
+    {
+        if (InputStride == sizeof(HALF))
+        {
+            if (OutputStride == sizeof(float))
+            {
+                if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
+                        pHalf += InputStride * 4;
+
+                        __m128 FV = _mm_cvtph_ps(HV);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pFloat), FV);
+                        pFloat += OutputStride * 4;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
+                        pHalf += InputStride * 4;
+
+                        __m128 FV = _mm_cvtph_ps(HV);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
+                        pFloat += OutputStride * 4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, scattered output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m128i HV = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pHalf));
+                    pHalf += InputStride * 4;
+
+                    __m128 FV = _mm_cvtph_ps(HV);
+
+                    _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
+                    pFloat += OutputStride;
+                    i += 4;
+                }
+            }
+        }
+        else if (OutputStride == sizeof(float))
+        {
+            if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
+            {
+                // Scattered input, aligned & packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+
+                    __m128i HV = _mm_setzero_si128();
+                    HV = _mm_insert_epi16(HV, H1, 0);
+                    HV = _mm_insert_epi16(HV, H2, 1);
+                    HV = _mm_insert_epi16(HV, H3, 2);
+                    HV = _mm_insert_epi16(HV, H4, 3);
+                    __m128 FV = _mm_cvtph_ps(HV);
+
+                    XM_STREAM_PS(reinterpret_cast<float*>(pFloat), FV);
+                    pFloat += OutputStride * 4;
+                    i += 4;
+                }
+            }
+            else
+            {
+                // Scattered input, packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+
+                    __m128i HV = _mm_setzero_si128();
+                    HV = _mm_insert_epi16(HV, H1, 0);
+                    HV = _mm_insert_epi16(HV, H2, 1);
+                    HV = _mm_insert_epi16(HV, H3, 2);
+                    HV = _mm_insert_epi16(HV, H4, 3);
+                    __m128 FV = _mm_cvtph_ps(HV);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
+                    pFloat += OutputStride * 4;
+                    i += 4;
+                }
+            }
+        }
+        else
+        {
+            // Scattered input, scattered output
+            for (size_t j = 0; j < four; ++j)
+            {
+                uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+
+                __m128i HV = _mm_setzero_si128();
+                HV = _mm_insert_epi16(HV, H1, 0);
+                HV = _mm_insert_epi16(HV, H2, 1);
+                HV = _mm_insert_epi16(HV, H3, 2);
+                HV = _mm_insert_epi16(HV, H4, 3);
+                __m128 FV = _mm_cvtph_ps(HV);
+
+                _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
+                pFloat += OutputStride;
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < HalfCount; ++i)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
+    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = HalfCount >> 2;
+    if (four > 0)
+    {
+        if (InputStride == sizeof(HALF))
+        {
+            if (OutputStride == sizeof(float))
+            {
+                // Packed input, packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
+                    pHalf += InputStride * 4;
+
+                    float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
+
+                    vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
+                    pFloat += OutputStride * 4;
+                    i += 4;
+                }
+            }
+            else
+            {
+                // Packed input, scattered output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
+                    pHalf += InputStride * 4;
+
+                    float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
+
+                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
+                    pFloat += OutputStride;
+                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
+                    pFloat += OutputStride;
+                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
+                    pFloat += OutputStride;
+                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
+                    pFloat += OutputStride;
+                    i += 4;
+                }
+            }
+        }
+        else if (OutputStride == sizeof(float))
+        {
+            // Scattered input, packed output
+            for (size_t j = 0; j < four; ++j)
+            {
+                uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+
+                uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
+                uint16x4_t vHalf = vcreate_u16(iHalf);
+
+                float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
+
+                vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
+                pFloat += OutputStride * 4;
+                i += 4;
+            }
+        }
+        else
+        {
+            // Scattered input, scattered output
+            for (size_t j = 0; j < four; ++j)
+            {
+                uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+
+                uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
+                uint16x4_t vHalf = vcreate_u16(iHalf);
vcreate_u16(iHalf);
+
+                float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
+
+                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
+                pFloat += OutputStride;
+                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
+                pFloat += OutputStride;
+                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
+                pFloat += OutputStride;
+                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
+                pFloat += OutputStride;
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < HalfCount; ++i)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    return pOutputStream;
+#else
+    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    for (size_t i = 0; i < HalfCount; i++)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    return pOutputStream;
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+inline HALF XMConvertFloatToHalf(float Value) noexcept
+{
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128 V1 = _mm_set_ss(Value);
+    __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
+    return static_cast<HALF>(_mm_extract_epi16(V2, 0));
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
+    float32x4_t vFloat = vdupq_n_f32(Value);
+    float16x4_t vHalf = vcvt_f16_f32(vFloat);
+    return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
+#else
+    uint32_t Result;
+
+    auto IValue = reinterpret_cast<uint32_t*>(&Value)[0];
+    uint32_t Sign = (IValue & 0x80000000U) >> 16U;
+    IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign
+    if (IValue >= 0x47800000 /*e+16*/)
+    {
+        // The number is too large to be represented as a half. Return infinity or NaN
+        Result = 0x7C00U | ((IValue > 0x7F800000) ? (0x200 | ((IValue >> 13U) & 0x3FFU)) : 0U);
+    }
+    else if (IValue <= 0x33000000U /*e-25*/)
+    {
+        Result = 0;
+    }
+    else if (IValue < 0x38800000U /*e-14*/)
+    {
+        // The number is too small to be represented as a normalized half.
+        // Convert it to a denormalized value.
+        uint32_t Shift = 125U - (IValue >> 23U);
+        IValue = 0x800000U | (IValue & 0x7FFFFFU);
+        Result = IValue >> (Shift + 1);
+        uint32_t s = (IValue & ((1U << Shift) - 1)) != 0;
+        Result += (Result | s) & ((IValue >> Shift) & 1U);
+    }
+    else
+    {
+        // Rebias the exponent to represent the value as a normalized half.
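+        // 0xC8000000 is -112 << 23: it drops the exponent bias from 127 (float)
+        // to 15 (half) before the mantissa is shifted down by 13 bits. The
+        // +0x0FFF plus the kept LSB ((IValue >> 13) & 1) rounds to nearest-even.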
+        IValue += 0xC8000000U;
+        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU;
+    }
+    return static_cast<HALF>(Result | Sign);
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline HALF* XMConvertFloatToHalfStream
+(
+    HALF* pOutputStream,
+    size_t       OutputStride,
+    const float* pInputStream,
+    size_t       InputStride,
+    size_t       FloatCount
+) noexcept
+{
+    assert(pOutputStream);
+    assert(pInputStream);
+
+    assert(InputStride >= sizeof(float));
+    _Analysis_assume_(InputStride >= sizeof(float));
+
+    assert(OutputStride >= sizeof(HALF));
+    _Analysis_assume_(OutputStride >= sizeof(HALF));
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = FloatCount >> 2;
+    if (four > 0)
+    {
+        if (InputStride == sizeof(float))
+        {
+            if (OutputStride == sizeof(HALF))
+            {
+                if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
+                {
+                    // Aligned and packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
+                        pFloat += InputStride * 4;
+
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
+
+                        _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
+                        pHalf += OutputStride * 4;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
+                        pFloat += InputStride * 4;
+
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
+
+                        _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
+                        pHalf += OutputStride * 4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0)
+                {
+                    // Aligned & packed input, scattered output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
+                        pFloat += InputStride * 4;
+
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
+
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
+                        pHalf += OutputStride;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, scattered output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
+                        pFloat += InputStride * 4;
+
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
+
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
+                        pHalf += OutputStride;
+                        i += 4;
+                    }
+                }
+            }
+        }
+        else if (OutputStride == sizeof(HALF))
+        {
+            // Scattered input, packed output
+            for (size_t j = 0; j < four; ++j)
+            {
+                __m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
+                pFloat += InputStride;
+
+                __m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
+                pFloat += InputStride;
+
+                __m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
+                pFloat += InputStride;
+
+                __m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
+                pFloat += InputStride;
+
+                __m128 FV = _mm_blend_ps(FV1, FV2,
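+                    // _mm_blend_ps: a set mask bit i takes lane i from the second
+                    // operand, so mask 0x2 merges FV2's broadcast value into lane 1.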
0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + 
{ + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#else + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadColor(const XMCOLOR* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. Keep in int32_t + auto iColor = static_cast(pSource->c); + XMVECTORF32 vColor = { { { + static_cast((iColor >> 16) & 0xFF)* (1.0f / 255.0f), + static_cast((iColor >> 8) & 0xFF)* (1.0f / 255.0f), + static_cast(iColor & 0xFF)* (1.0f / 255.0f), + static_cast((iColor >> 24) & 0xFF)* (1.0f / 255.0f) + } } }; + return vColor.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t bgra = pSource->c; + uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); + uint32x2_t vInt8 = vdup_n_u32(rgba); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(static_cast(pSource->c)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt, g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt, g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
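+    // _mm_cvtepi32_ps is a signed conversion, so the alpha sign bit was flipped
+    // by the XOR above; adding 0x80000000 as a float here compensates for that.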
+    vTemp = _mm_add_ps(vTemp, g_XMFixAA8R8G8B8);
+    // Convert 0-255 to 0.0f-1.0f
+    return _mm_mul_ps(vTemp, g_XMNormalizeA8R8G8B8);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadHalf2(const XMHALF2* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128 V = _mm_load_ss(reinterpret_cast<const float*>(pSource));
+    return _mm_cvtph_ps(_mm_castps_si128(V));
+#else
+    XMVECTORF32 vResult = { { {
+            XMConvertHalfToFloat(pSource->x),
+            XMConvertHalfToFloat(pSource->y),
+            0.0f,
+            0.0f
+        } } };
+    return vResult.v;
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadShortN2(const XMSHORTN2* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            (pSource->x == -32768) ? -1.f : (static_cast<float>(pSource->x) * (1.0f / 32767.0f)),
+            (pSource->y == -32768) ? -1.f : (static_cast<float>(pSource->y) * (1.0f / 32767.0f)),
+            0.0f,
+            0.0f
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
+    int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16));
+    vInt = vandq_s32(vInt, g_XMMaskXY);
+    float32x4_t R = vcvtq_f32_s32(vInt);
+    R = vmulq_n_f32(R, 1.0f / 32767.0f);
+    return vmaxq_f32(R, vdupq_n_f32(-1.f));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the two shorts in all four entries (WORD alignment okay,
+    // DWORD alignment preferred)
+    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
+    vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16);
+    // x needs to be sign extended
+    vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x - 0x8000 to undo the signed order.
+    vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16);
+    // Convert -1.0f - 1.0f
+    vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16);
+    // Clamp result (for case of -32768)
+    return _mm_max_ps(vTemp, g_XMNegativeOne);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadShort2(const XMSHORT2* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            static_cast<float>(pSource->x),
+            static_cast<float>(pSource->y),
+            0.f,
+            0.f
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
+    int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16));
+    vInt = vandq_s32(vInt, g_XMMaskXY);
+    return vcvtq_f32_s32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the two shorts in all four entries (WORD alignment okay,
+    // DWORD alignment preferred)
+    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
+    vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16);
+    // x needs to be sign extended
+    vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x - 0x8000 to undo the signed order.
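+    // y still sits in the upper 16 bits at this point, so it comes out 65536
+    // times too large; g_XMFixupY16 below rescales it while leaving x alone.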
+ vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp, g_XMFixupY16); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN2(const XMUSHORTN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_u32(vInt); + R = vmulq_n_f32(R, 1.0f / 65535.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = { { { 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 0.0f, 0.0f } } }; + static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f * 65536.0f, 0, 0 } } }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp, g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp, FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp, FixupY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort2(const XMUSHORT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f, 0, 0 } } }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp, g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp, FixaddY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN2(const XMBYTEN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x)* (1.0f / 127.0f)), + (pSource->y == -128) ? 
-1.f : (static_cast(pSource->y)* (1.0f / 127.0f)), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 127.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, Scale); + // Clamp result (for case of -128) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte2(const XMBYTE2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN2(const XMUBYTEN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x)* (1.0f / 255.0f), + static_cast(pSource->y)* (1.0f / 255.0f), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU565(const XMU565* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, U565And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, U565Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } }; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, U565Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3PK(const XMFLOAT3PK* pSource) noexcept +{ + assert(pSource); + + XM_ALIGNED_DATA(16) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit mantissa) + Mantissa = pSource->xm; + + if (pSource->xe == 0x1f) // INF or NAN + { + Result[0] = static_cast(0x7f800000 | (static_cast(pSource->xm) << 17)); + } + else + { + if (pSource->xe != 0) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if (pSource->ye == 0x1f) // INF or NAN + { + Result[1] = static_cast(0x7f800000 | (static_cast(pSource->ym) << 17)); + } + else + { + if (pSource->ye != 0) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if (pSource->ze == 0x1f) // INF or NAN + { + Result[2] = static_cast(0x7f800000 | (static_cast(pSource->zm) << 17)); + } + else + { + if (pSource->ze != 0) // The value is normalized + { 
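+            // z carries a 5-bit mantissa (x and y carry 6 bits), so the denormal
+            // loop below tests bit 0x20 and the mantissa lands at bit 18 (= 23 - 5).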
+ Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A(reinterpret_cast(&Result)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept +{ + assert(pSource); + + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (pSource->e << 23); + float Scale = fi.f; + + XMVECTORF32 v = { { { + Scale * float(pSource->xm), + Scale * float(pSource->ym), + Scale * float(pSource->zm), + 1.0f } } }; + return v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf4(const XMHALF4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V = _mm_loadl_epi64(reinterpret_cast(pSource)); + return _mm_cvtph_ps(V); +#else + XMVECTORF32 vResult = { { { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + } } }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN4(const XMSHORTN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x)* (1.0f / 32767.0f)), + (pSource->y == -32768) ? -1.f : (static_cast(pSource->y)* (1.0f / 32767.0f)), + (pSource->z == -32768) ? -1.f : (static_cast(pSource->z)* (1.0f / 32767.0f)), + (pSource->w == -32768) ? -1.f : (static_cast(pSource->w)* (1.0f / 32767.0f)) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16(vInt); + float32x4_t vResult = vcvtq_f32_s32(V); + vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f); + return vmaxq_f32(vResult, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16); + // Convert to -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16Z16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); + // Clamp result (for case of -32768) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort4(const XMSHORT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16(vInt); + return vcvtq_f32_s32(V); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + static_cast(pSource->z) / 65535.0f, + static_cast(pSource->w) / 65535.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16(vInt); + float32x4_t vResult = vcvtq_f32_u32(V); + return vmulq_n_f32(vResult, 1.0f / 65535.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 1.0f / (65535.0f * 65536.0f) } } }; + static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, FixaddY16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, FixupY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort4(const XMUSHORT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16(vInt); + return vcvtq_f32_u32(V); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f, 32768.0f } } }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, FixaddY16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDecN4(const XMXDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + static_cast(pSource->v >> 30) / 3.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskA2B10G10R10); + vInt = veorq_u32(vInt, g_XMFlipA2B10G10R10); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMFixAA2B10G10R10); + R = vmulq_f32(R, g_XMNormalizeA2B10G10R10); + return vmaxq_f32(R, vdupq_n_f32(-1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskA2B10G10R10); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipA2B10G10R10); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
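+    // 2.10.10.10 layout: x/y/z are signed 10-bit fields and w is an unsigned
+    // 2-bit field; the flip/convert/fix sequence handles the unsigned w lane.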
+ vTemp = _mm_add_ps(vTemp, g_XMFixAA2B10G10R10); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeA2B10G10R10); + // Clamp result (for case of -512) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDec4(const XMXDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(pSource->v >> 30) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } }; + static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 32768 * 65536.0f } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, XDec4Xor); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, XDec4Add); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } }; + static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 32768 * 65536.0f } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, XDec4Xor); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
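+    // XDec4Xor turned each lane into offset-binary (bias 0x200 for the 10-bit
+    // lanes, bit 31 for w); XDec4Add subtracts those biases back out.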
+    vTemp = _mm_add_ps(vTemp, XDec4Add);
+    // Convert 0-255 to 0.0f-1.0f
+    vTemp = _mm_mul_ps(vTemp, g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUDecN4(const XMUDECN4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = { { {
+            static_cast<float>(ElementX) / 1023.0f,
+            static_cast<float>(ElementY) / 1023.0f,
+            static_cast<float>(ElementZ) / 1023.0f,
+            static_cast<float>(pSource->v >> 30) / 3.0f
+        } } };
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f), 1.0f / (1023.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } };
+    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
+    vInt = vandq_u32(vInt, g_XMMaskDec4);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R, UDecN4Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f), 1.0f / (1023.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } };
+    // Splat the color in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
+    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
+    // a is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // RGB + 0, A + 0x80000000.f to undo the signed order.
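+    // UDecN4Mul folds two scalings into one multiply: the 1/1023 (or 1/3 for w)
+    // normalization and each lane's 2^-10 / 2^-20 / 2^-30 bit-position descale.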
+ vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, UDecN4Mul); + return vTemp; +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + int32_t ElementX = pSource->v & 0x3FF; + int32_t ElementY = (pSource->v >> 10) & 0x3FF; + int32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX - 0x180) / 510.0f, + static_cast(ElementY - 0x180) / 510.0f, + static_cast(ElementZ - 0x180) / 510.0f, + static_cast(pSource->v >> 30) / 3.0f + } } }; + + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f * 1024.0f), 1.0f / (510.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias); + vTemp = veorq_s32(vTemp, g_XMFlipW); + float32x4_t R = vcvtq_f32_s32(vTemp); + R = vaddq_f32(R, g_XMAddUDec4); + return vmulq_f32(R, XRMul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f * 1024.0f), 1.0f / (510.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask channels + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // Subtract bias + vTemp = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(vTemp), XRBias)); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert to 0.0f-1.0f + return _mm_mul_ps(vTemp, XRMul); +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDec4(const XMUDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX), + static_cast(ElementY), + static_cast(ElementZ), + static_cast(pSource->v >> 30) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! 
Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDecN4(const XMDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + static const uint32_t SignExtendW[] = { 0x00000000, 0xFFFFFFFC }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { { { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + (ElementW == 0x2) ? -1.f : static_cast(static_cast(ElementW | SignExtendW[(ElementW >> 1) & 1])) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f * 1024.0f), 1.0f / (511.0f * 1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMAddDec4); + R = vmulq_f32(R, DecN4Mul); + return vmaxq_f32(R, vdupq_n_f32(-1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f * 1024.0f), 1.0f / (511.0f * 1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
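+    // After the XOR the lanes are offset-binary; g_XMAddDec4 removes the
+    // 0x200/0x2 biases and DecN4Mul normalizes to [-1, 1] (clamped below, so
+    // -512 in x/y/z and -2 in w also map to -1).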
+ vTemp = _mm_add_ps(vTemp, g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, DecN4Mul); + // Clamp result (for case of -512/-1) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDec4(const XMDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + static const uint32_t SignExtendW[] = { 0x00000000, 0xFFFFFFFC }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { { { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(static_cast(ElementW | SignExtendW[ElementW >> 1])) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMAddDec4); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#pragma warning(pop) + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN4(const XMUBYTEN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 255.0f, + static_cast(pSource->y) / 255.0f, + static_cast(pSource->z) / 255.0f, + static_cast(pSource->w) / 255.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByteN4Mul = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 1.0f / (255.0f * 65536.0f), 1.0f / (255.0f * 65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // w + 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp, LoadUByteN4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUByte4(const XMUBYTE4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            static_cast<float>(pSource->x),
+            static_cast<float>(pSource->y),
+            static_cast<float>(pSource->z),
+            static_cast<float>(pSource->w)
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
+    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
+    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
+    return vcvtq_f32_u32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadUByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } };
+    // Splat the color in all four entries (x,z,y,w)
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
+    // w is signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // w + 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp, LoadUByte4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadByteN4(const XMBYTEN4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            (pSource->x == -128) ? -1.f : (static_cast<float>(pSource->x) / 127.0f),
+            (pSource->y == -128) ? -1.f : (static_cast<float>(pSource->y) / 127.0f),
+            (pSource->z == -128) ? -1.f : (static_cast<float>(pSource->z) / 127.0f),
+            (pSource->w == -128) ? -1.f : (static_cast<float>(pSource->w) / 127.0f)
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
+    int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8));
+    int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16));
+    float32x4_t R = vcvtq_f32_s32(vInt);
+    R = vmulq_n_f32(R, 1.0f / 127.0f);
+    return vmaxq_f32(R, vdupq_n_f32(-1.f));
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadByteN4Mul = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 1.0f / (127.0f * 65536.0f), 1.0f / (127.0f * 65536.0f * 256.0f) } } };
+    // Splat the color in all four entries (x,z,y,w)
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
+    // x,y and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp, g_XMXorByte4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x, y and z - 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp, g_XMAddByte4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp, LoadByteN4Mul);
+    // Clamp result (for case of -128)
+    return _mm_max_ps(vTemp, g_XMNegativeOne);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadByte4(const XMBYTE4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            static_cast<float>(pSource->x),
+            static_cast<float>(pSource->y),
+            static_cast<float>(pSource->z),
+            static_cast<float>(pSource->w)
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
+    int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8));
+    int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16));
+    return vcvtq_f32_s32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } };
+    // Splat the color in all four entries (x,z,y,w)
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
+    // x,y and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp, g_XMXorByte4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x, y and z - 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp, g_XMAddByte4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp, LoadByte4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUNibble4(const XMUNIBBLE4* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            float(pSource->v & 0xF),
+            float((pSource->v >> 4) & 0xF),
+            float((pSource->v >> 8) & 0xF),
+            float((pSource->v >> 12) & 0xF)
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } };
+    static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } };
+    uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
+    uint32x4_t vInt = vmovl_u16(vInt16);
+    vInt = vandq_u32(vInt, UNibble4And);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R, UNibble4Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } };
+    static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } };
+    // Get the 32 bit value and splat it
+    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask off x, y and z
+    vResult = _mm_and_ps(vResult, UNibble4And);
+    // Convert to float
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Normalize x, y, and z
+    vResult = _mm_mul_ps(vResult, UNibble4Mul);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadU555(const XMU555* pSource) noexcept
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = { { {
+            float(pSource->v & 0x1F),
+            float((pSource->v >> 5) & 0x1F),
+            float((pSource->v >> 10) & 0x1F),
+            float((pSource->v >> 15) & 0x1)
+        } } };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } };
+    static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } };
+    uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
+    uint32x4_t vInt = vmovl_u16(vInt16);
+    vInt = vandq_u32(vInt, U555And);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R, U555Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } };
+    static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } };
+    // Get the 32 bit value and splat it
+    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask off x, y and z
+    vResult = _mm_and_ps(vResult, U555And);
+    // Convert to float
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Normalize x, y, and z
+    vResult = _mm_mul_ps(vResult, U555Mul);
+    return vResult;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+/****************************************************************************
+ *
+ * Vector and matrix store operations
+ *
+ ****************************************************************************/
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreColor
+(
+    XMCOLOR* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->c = (static_cast<uint32_t>(tmp.w) << 24) |
+        (static_cast<uint32_t>(tmp.x) << 16) |
+        (static_cast<uint32_t>(tmp.y) << 8) |
+        static_cast<uint32_t>(tmp.z);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 255.0f);
+    R = XMVectorRound(R);
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
+    uint32_t rgba = vget_lane_u32(vreinterpret_u32_u8(vInt8), 0);
+    pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Set <0 to 0
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    // Set>1 to 1
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Convert to 0-255
+    vResult = _mm_mul_ps(vResult, g_UByteMax);
+    // Shuffle RGBA to ARGB
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert to int
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Mash to shorts
+    vInt = _mm_packs_epi32(vInt, vInt);
+    // Mash to bytes
+    vInt = _mm_packus_epi16(vInt, vInt);
+    // Store the color
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->c), _mm_castsi128_ps(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreHalf2
+(
+    XMHALF2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
+    _mm_store_ss(reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1));
+#else
+    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
+    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreShortN2
+(
+    XMSHORTN2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int16_t>(tmp.x);
+    pDestination->y = static_cast<int16_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 32767.0f);
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32(vInt32);
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    vResult = _mm_mul_ps(vResult, g_ShortMax);
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    vResulti = _mm_packs_epi32(vResulti, vResulti);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreShort2
+(
+    XMSHORT2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int16_t>(tmp.x);
+    pDestination->y = static_cast<int16_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f));
+    R = vminq_f32(R, vdupq_n_f32(32767.0f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32(vInt32);
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_ShortMin);
+    vResult = _mm_min_ps(vResult, g_ShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Pack the ints into shorts
+    vInt = _mm_packs_epi32(vInt, vInt);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x), _mm_castsi128_ps(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUShortN2
+(
+    XMUSHORTN2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint16_t>(tmp.x);
+    pDestination->y = static_cast<uint16_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 65535.0f);
+    R = vaddq_f32(R, g_XMOneHalf);
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    vResult = _mm_mul_ps(vResult, g_UShortMax);
+    vResult = _mm_add_ps(vResult, g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUShort2
+(
+    XMUSHORT2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint16_t>(tmp.x);
+    pDestination->y = static_cast<uint16_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
+    R = vminq_f32(R, vdupq_n_f32(65535.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_UShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreByteN2
+(
+    XMBYTEN2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int8_t>(tmp.x);
+    pDestination->y = static_cast<int8_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 127.0f);
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32(vInt32);
+    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
+    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination), vreinterpret_u16_s8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, g_ByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<int8_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<int8_t>(_mm_extract_epi16(vInt, 2));
+    pDestination->v = static_cast<uint16_t>(((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreByte2
+(
+    XMBYTE2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int8_t>(tmp.x);
+    pDestination->y = static_cast<int8_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f));
+    R = vminq_f32(R, vdupq_n_f32(127.0f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32(vInt32);
+    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
+    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination), vreinterpret_u16_s8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_ByteMin);
+    vResult = _mm_min_ps(vResult, g_ByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<int8_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<int8_t>(_mm_extract_epi16(vInt, 2));
+    pDestination->v = static_cast<uint16_t>(((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUByteN2
+(
+    XMUBYTEN2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint8_t>(tmp.x);
+    pDestination->y = static_cast<uint8_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 255.0f);
+    R = vaddq_f32(R, g_XMOneHalf);
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
+    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination), vreinterpret_u16_u8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, g_UByteMax);
+    vResult = _mm_add_ps(vResult, g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<uint8_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<uint8_t>(_mm_extract_epi16(vInt, 2));
+    pDestination->v = static_cast<uint16_t>(((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUByte2
+(
+    XMUBYTE2* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint8_t>(tmp.x);
+    pDestination->y = static_cast<uint8_t>(tmp.y);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
+    R = vminq_f32(R, vdupq_n_f32(255.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
+    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination), vreinterpret_u16_u8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_UByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<uint8_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<uint8_t>(_mm_extract_epi16(vInt, 2));
+    pDestination->v = static_cast<uint16_t>(((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreU565
+(
+    XMU565* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 Max = { { { 31.0f, 63.0f, 31.0f, 0.0f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint16_t>(
+        ((static_cast<int>(tmp.z) & 0x1F) << 11)
+        | ((static_cast<int>(tmp.y) & 0x3F) << 5)
+        | ((static_cast<int>(tmp.x) & 0x1F)));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Scale = { { { 1.0f, 32.f, 32.f * 64.f, 0.f } } };
+    static const XMVECTORU32 Mask = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } };
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
+    vResult = vminq_f32(vResult, Max);
+    vResult = vmulq_f32(vResult, Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti, Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vhi = vget_high_u32(vResulti);
+    vTemp = vorr_u32(vTemp, vhi);
+    vTemp = vpadd_u32(vTemp, vTemp);
+    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+    auto z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
+    pDestination->v = static_cast<uint16_t>(
+        ((static_cast<int>(z) & 0x1F) << 11)
+        | ((static_cast<int>(y) & 0x3F) << 5)
+        | ((static_cast<int>(x) & 0x1F)));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3PK
+(
+    XMFLOAT3PK* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+
+    XM_ALIGNED_DATA(16) uint32_t IValue[4];
+    XMStoreFloat3A(reinterpret_cast<XMFLOAT3A*>(&IValue), V);
+
+    uint32_t Result[3];
+
+    // X & Y Channels (5-bit exponent, 6-bit mantissa)
+    for (uint32_t j = 0; j < 2; ++j)
+    {
+        uint32_t Sign = IValue[j] & 0x80000000;
+        uint32_t I = IValue[j] & 0x7FFFFFFF;
+
+        if ((I & 0x7F800000) == 0x7F800000)
+        {
+            // INF or NAN
+            Result[j] = 0x7C0U;
+            if ((I & 0x7FFFFF) != 0)
+            {
+                Result[j] = 0x7FFU;
+            }
+            else if (Sign)
+            {
+                // -INF is clamped to 0 since 3PK is positive only
+                Result[j] = 0;
+            }
+        }
+        else if (Sign || I < 0x35800000)
+        {
+            // 3PK is positive only, so clamp to zero
+            Result[j] = 0;
+        }
+        else if (I > 0x477E0000U)
+        {
+            // The number is too large to be represented as a float11, set to max
+            Result[j] = 0x7BFU;
+        }
+        else
+        {
+            if (I < 0x38800000U)
+            {
+                // The number is too small to be represented as a normalized float11
+                // Convert it to a denormalized value.
+                uint32_t Shift = 113U - (I >> 23U);
+                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+            }
+            else
+            {
+                // Rebias the exponent to represent the value as a normalized float11
+                I += 0xC8000000U;
+            }
+
+            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U) & 0x7ffU;
+        }
+    }
+
+    // Z Channel (5-bit exponent, 5-bit mantissa)
+    uint32_t Sign = IValue[2] & 0x80000000;
+    uint32_t I = IValue[2] & 0x7FFFFFFF;
+
+    if ((I & 0x7F800000) == 0x7F800000)
+    {
+        // INF or NAN
+        Result[2] = 0x3E0U;
+        if (I & 0x7FFFFF)
+        {
+            Result[2] = 0x3FFU;
+        }
+        else if (Sign)
+        {
+            // -INF is clamped to 0 since 3PK is positive only
+            Result[2] = 0;
+        }
+    }
+    else if (Sign || I < 0x36000000)
+    {
+        // 3PK is positive only, so clamp to zero
+        Result[2] = 0;
+    }
+    else if (I > 0x477C0000U)
+    {
+        // The number is too large to be represented as a float10, set to max
+        Result[2] = 0x3DFU;
+    }
+    else
+    {
+        if (I < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized float10
+            // Convert it to a denormalized value.
+            uint32_t Shift = 113U - (I >> 23U);
+            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized float10
+            I += 0xC8000000U;
+        }
+
+        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U) & 0x3ffU;
+    }
+
+    // Pack Result into memory
+    pDestination->v = (Result[0] & 0x7ff)
+        | ((Result[1] & 0x7ff) << 11)
+        | ((Result[2] & 0x3ff) << 22);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreFloat3SE
+(
+    XMFLOAT3SE* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+
+    XMFLOAT3A tmp;
+    XMStoreFloat3A(&tmp, V);
+
+    static constexpr float maxf9 = float(0x1FF << 7);
+    static constexpr float minf9 = float(1.f / (1 << 16));
+
+    float x = (tmp.x >= 0.f) ? ((tmp.x > maxf9) ? maxf9 : tmp.x) : 0.f;
+    float y = (tmp.y >= 0.f) ? ((tmp.y > maxf9) ? maxf9 : tmp.y) : 0.f;
+    float z = (tmp.z >= 0.f) ? ((tmp.z > maxf9) ? maxf9 : tmp.z) : 0.f;
+
+    const float max_xy = (x > y) ? x : y;
+    const float max_xyz = (max_xy > z) ? max_xy : z;
+
+    const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
+
+    union { float f; int32_t i; } fi;
+    fi.f = maxColor;
+    fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
+
+    auto exp = static_cast<uint32_t>(fi.i) >> 23;
+    pDestination->e = exp - 0x6f;
+
+    fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));
+    float ScaleR = fi.f;
+
+    pDestination->xm = static_cast<uint32_t>(Internal::round_to_nearest(x * ScaleR));
+    pDestination->ym = static_cast<uint32_t>(Internal::round_to_nearest(y * ScaleR));
+    pDestination->zm = static_cast<uint32_t>(Internal::round_to_nearest(z * ScaleR));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreHalf4
+(
+    XMHALF4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDestination), V1);
+#else
+    XMFLOAT4A t;
+    XMStoreFloat4A(&t, V);
+
+    pDestination->x = XMConvertFloatToHalf(t.x);
+    pDestination->y = XMConvertFloatToHalf(t.y);
+    pDestination->z = XMConvertFloatToHalf(t.z);
+    pDestination->w = XMConvertFloatToHalf(t.w);
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreShortN4
+(
+    XMSHORTN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int16_t>(tmp.x);
+    pDestination->y = static_cast<int16_t>(tmp.y);
+    pDestination->z = static_cast<int16_t>(tmp.z);
+    pDestination->w = static_cast<int16_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
+    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
+    vResult = vmulq_n_f32(vResult, 32767.0f);
+    int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
+    vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    vResult = _mm_mul_ps(vResult, g_ShortMax);
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    vResulti = _mm_packs_epi32(vResulti, vResulti);
+    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x), _mm_castsi128_pd(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreShort4
+(
+    XMSHORT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int16_t>(tmp.x);
+    pDestination->y = static_cast<int16_t>(tmp.y);
+    pDestination->z = static_cast<int16_t>(tmp.z);
+    pDestination->w = static_cast<int16_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32(V, g_ShortMin);
+    vResult = vminq_f32(vResult, g_ShortMax);
+    int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
+    vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_ShortMin);
+    vResult = _mm_min_ps(vResult, g_ShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Pack the ints into shorts
+    vInt = _mm_packs_epi32(vInt, vInt);
+    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x), _mm_castsi128_pd(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUShortN4
+(
+    XMUSHORTN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint16_t>(tmp.x);
+    pDestination->y = static_cast<uint16_t>(tmp.y);
+    pDestination->z = static_cast<uint16_t>(tmp.z);
+    pDestination->w = static_cast<uint16_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
+    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
+    vResult = vmulq_n_f32(vResult, 65535.0f);
+    vResult = vaddq_f32(vResult, g_XMOneHalf);
+    uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
+    vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    vResult = _mm_mul_ps(vResult, g_UShortMax);
+    vResult = _mm_add_ps(vResult, g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
+    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUShort4
+(
+    XMUSHORT4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint16_t>(tmp.x);
+    pDestination->y = static_cast<uint16_t>(tmp.y);
+    pDestination->z = static_cast<uint16_t>(tmp.z);
+    pDestination->w = static_cast<uint16_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
+    vResult = vminq_f32(vResult, g_UShortMax);
+    uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
+    vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_UShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
+    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreXDecN4
+(
+    XMXDECN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 Min = { { { -1.0f, -1.0f, -1.0f, 0.0f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 3.0f } } };
+
+    XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v);
+    N = XMVectorMultiply(N, Scale.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<int>(tmp.w) << 30)
+        | ((static_cast<int>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<int>(tmp.y) & 0x3FF) << 10)
+        | (static_cast<int>(tmp.x) & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Scale = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f } } };
+    static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } };
+    float32x4_t vResult = vmaxq_f32(V, Min);
+    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
+    vResult = vmulq_f32(vResult, Scale);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti, ScaleMask);
+    int32x4_t vResultw = vandq_s32(vResulti, g_XMMaskW);
+    vResulti = vaddq_s32(vResulti, vResultw);
+    // Do a horizontal or of all 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32(vTemp, vhi);
+    vTemp = vpadd_u32(vTemp, vTemp);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Scale = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f } } };
+    static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } };
+    XMVECTOR vResult = _mm_max_ps(V, Min);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, Scale);
+    // Convert to int (W is unsigned)
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, ScaleMask);
+    // To fix W, add itself to shift it up to <<30 instead of <<29
+    __m128i vResultw = _mm_and_si128(vResulti, g_XMMaskW);
+    vResulti = _mm_add_epi32(vResulti, vResultw);
+    // Do a horizontal or of all 4 entries
+    vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti), _MM_SHUFFLE(0, 3, 2, 1));
+    vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult));
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1));
+    vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult));
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1));
+    vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult));
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreXDec4
+(
+    XMXDEC4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 MinXDec4 = { { { -511.0f, -511.0f, -511.0f, 0.0f } } };
+    static const XMVECTORF32 MaxXDec4 = { { { 511.0f, 511.0f, 511.0f, 3.0f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, MinXDec4, MaxXDec4);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<int>(tmp.w) << 30)
+        | ((static_cast<int>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<int>(tmp.y) & 0x3FF) << 10)
+        | ((static_cast<int>(tmp.x) & 0x3FF)));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } };
+    static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    float32x4_t vResult = vmaxq_f32(V, MinXDec4);
+    vResult = vminq_f32(vResult, MaxXDec4);
+    vResult = vmulq_f32(vResult, ScaleXDec4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti, MaskXDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vTemp2 = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32(vTemp, vTemp2);
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32(vTemp, 1);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } };
+    static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, MinXDec4);
+    vResult = _mm_min_ps(vResult, MaxXDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleXDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskXDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // Perform a single bit left shift on y|w
+    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUDecN4
+(
+    XMUDECN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Scale = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } };
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, Scale.v);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<uint32_t>(tmp.w) << 30)
+        | ((static_cast<uint32_t>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<uint32_t>(tmp.y) & 0x3FF) << 10)
+        | ((static_cast<uint32_t>(tmp.x) & 0x3FF)));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f, 3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f } } };
+    static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f));
+    vResult = vminq_f32(vResult, vdupq_n_f32(1.f));
+    vResult = vmulq_f32(vResult, ScaleUDecN4);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti, MaskUDecN4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32(vTemp, 1);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f, 3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f } } };
+    static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleUDecN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskUDecN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // Perform a left shift by one bit on y|w
+    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUDecN4_XR
+(
+    XMUDECN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 Scale = { { { 510.0f, 510.0f, 510.0f, 3.0f } } };
+    static const XMVECTORF32 Bias = { { { 384.0f, 384.0f, 384.0f, 0.0f } } };
+    static const XMVECTORF32 C = { { { 1023.f, 1023.f, 1023.f, 3.f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorMultiplyAdd(V, Scale, Bias);
+    N = XMVectorClamp(N, g_XMZero, C);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<uint32_t>(tmp.w) << 30)
+        | ((static_cast<uint32_t>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<uint32_t>(tmp.y) & 0x3FF) << 10)
+        | ((static_cast<uint32_t>(tmp.x) & 0x3FF)));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f * 0.5f } } };
+    static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    float32x4_t vResult = vmlaq_f32(Bias, V, Scale);
+    vResult = vmaxq_f32(vResult, vdupq_n_f32(0.f));
+    vResult = vminq_f32(vResult, C);
+    vResult = vmulq_f32(vResult, Shift);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti, MaskUDecN4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32(vTemp, 1);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f * 0.5f } } };
+    static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    // Scale & bias
+    XMVECTOR vResult = XM_FMADD_PS(V, Scale, Bias);
+    // Clamp to bounds
+    vResult = _mm_max_ps(vResult, g_XMZero);
+    vResult = _mm_min_ps(vResult, C);
+    // Scale by shift values
+    vResult = _mm_mul_ps(vResult, Shift);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskUDecN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // Perform a left shift by one bit on y|w
+    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUDec4
+(
+    XMUDEC4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 MaxUDec4 = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), MaxUDec4);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<uint32_t>(tmp.w) << 30)
+        | ((static_cast<uint32_t>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<uint32_t>(tmp.y) & 0x3FF) << 10)
+        | ((static_cast<uint32_t>(tmp.x) & 0x3FF)));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } };
+    static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f));
+    vResult = vminq_f32(vResult, MaxUDec4);
+    vResult = vmulq_f32(vResult, ScaleUDec4);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti, MaskUDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32(vTemp, 1);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } };
+    static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, MaxUDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleUDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskUDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // Perform a left shift by one bit on y|w
+    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreDecN4
+(
+    XMDECN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 1.0f } } };
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, Scale.v);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<int>(tmp.w) << 30)
+        | ((static_cast<int>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<int>(tmp.y) & 0x3FF) << 10)
+        | ((static_cast<int>(tmp.x) & 0x3FF)));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f, 1.0f * 1024.0f * 1024.0f * 1024.0f } } };
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
+    vResult = vminq_f32(vResult, vdupq_n_f32(1.f));
+    vResult = vmulq_f32(vResult, ScaleDecN4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti, g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32(vTemp, vhi);
+    vTemp = vpadd_u32(vTemp, vTemp);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f, 1.0f * 1024.0f * 1024.0f * 1024.0f } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleDecN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreDec4
+(
+    XMDEC4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 MinDec4 = { { { -511.0f, -511.0f, -511.0f, -1.0f } } };
+    static const XMVECTORF32 MaxDec4 = { { { 511.0f, 511.0f, 511.0f, 1.0f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, MinDec4, MaxDec4);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint32_t>(
+        (static_cast<int>(tmp.w) << 30)
+        | ((static_cast<int>(tmp.z) & 0x3FF) << 20)
+        | ((static_cast<int>(tmp.y) & 0x3FF) << 10)
+        | ((static_cast<int>(tmp.x) & 0x3FF)));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f } } };
+    float32x4_t vResult = vmaxq_f32(V, MinDec4);
+    vResult = vminq_f32(vResult, MaxDec4);
+    vResult = vmulq_f32(vResult, ScaleDec4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti, g_XMMaskDec4);
+    // Do a horizontal or of all 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32(vTemp, vhi);
+    vTemp = vpadd_u32(vTemp, vTemp);
+    vst1_lane_u32(&pDestination->v, vTemp, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, MinDec4);
+    vResult = _mm_min_ps(vResult, MaxDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUByteN4
+(
+    XMUBYTEN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, g_UByteMax);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint8_t>(tmp.x);
+    pDestination->y = static_cast<uint8_t>(tmp.y);
+    pDestination->z = static_cast<uint8_t>(tmp.z);
+    pDestination->w = static_cast<uint8_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 255.0f);
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUByteN4 = { { { 255.0f, 255.0f * 256.0f * 0.5f, 255.0f * 256.0f * 256.0f, 255.0f * 256.0f * 256.0f * 256.0f * 0.5f } } };
+    static const XMVECTORI32 MaskUByteN4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleUByteN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskUByteN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // Perform a single bit left shift to fix y|w
+    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUByte4
+(
+    XMUBYTE4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<uint8_t>(tmp.x);
+    pDestination->y = static_cast<uint8_t>(tmp.y);
+    pDestination->z = static_cast<uint8_t>(tmp.z);
+    pDestination->w = static_cast<uint8_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0));
+    R = vminq_f32(R, vdupq_n_f32(255.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32(vInt32);
+    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUByte4 = { { { 1.0f, 256.0f * 0.5f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f * 0.5f } } };
+    static const XMVECTORI32 MaskUByte4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, g_UByteMax);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleUByte4);
+    // Convert to int by rounding
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskUByte4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // Perform a single bit left shift to fix y|w
+    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreByteN4
+(
+    XMBYTEN4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ByteMax);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int8_t>(tmp.x);
+    pDestination->y = static_cast<int8_t>(tmp.y);
+    pDestination->z = static_cast<int8_t>(tmp.z);
+    pDestination->w = static_cast<int8_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f));
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32(R, 127.0f);
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32(vInt32);
+    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleByteN4 = { { { 127.0f, 127.0f * 256.0f, 127.0f * 256.0f * 256.0f, 127.0f * 256.0f * 256.0f * 256.0f } } };
+    static const XMVECTORI32 MaskByteN4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, static_cast<int>(0xFF000000) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult, g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleByteN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskByteN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreByte4
+(
+    XMBYTE4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->x = static_cast<int8_t>(tmp.x);
+    pDestination->y = static_cast<int8_t>(tmp.y);
+    pDestination->z = static_cast<int8_t>(tmp.z);
+    pDestination->w = static_cast<int8_t>(tmp.w);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f));
+    R = vminq_f32(R, vdupq_n_f32(127.f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32(vInt32);
+    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
+    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleByte4 = { { { 1.0f, 256.0f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f } } };
+    static const XMVECTORI32 MaskByte4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, static_cast<int>(0xFF000000) } } };
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V, g_ByteMin);
+    vResult = _mm_min_ps(vResult, g_ByteMax);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult, ScaleByte4);
+    // Convert to int by rounding
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti, MaskByte4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti, vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v), _mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreUNibble4
+(
+    XMUNIBBLE4* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 Max = { { { 15.0f, 15.0f, 15.0f, 15.0f } } };
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint16_t>(
+        ((static_cast<int>(tmp.w) & 0xF) << 12)
+        | ((static_cast<int>(tmp.z) & 0xF) << 8)
+        | ((static_cast<int>(tmp.y) & 0xF) << 4)
+        | (static_cast<int>(tmp.x) & 0xF));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Scale = { { { 1.0f, 16.f, 16.f * 16.f, 16.f * 16.f * 16.f } } };
+    static const XMVECTORU32 Mask = { { { 0xF, 0xF << 4, 0xF << 8, 0xF << 12 } } };
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
+    vResult = vminq_f32(vResult, Max);
+    vResult = vmulq_f32(vResult, Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti, Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vhi = vget_high_u32(vResulti);
+    vTemp = vorr_u32(vTemp, vhi);
+    vTemp = vpadd_u32(vTemp, vTemp);
+    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+    auto z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
+    auto w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
+    pDestination->v = static_cast<uint16_t>(
+        ((static_cast<int>(w) & 0xF) << 12)
+        | ((static_cast<int>(z) & 0xF) << 8)
+        | ((static_cast<int>(y) & 0xF) << 4)
+        | ((static_cast<int>(x) & 0xF)));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV XMStoreU555
+(
+    XMU555* pDestination,
+    FXMVECTOR V
+) noexcept
+{
+    assert(pDestination);
+    static const XMVECTORF32 Max = { { { 31.0f, 31.0f, 31.0f, 1.0f } } };
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N);
+
+    pDestination->v = static_cast<uint16_t>(
+        ((tmp.w > 0.f) ? 0x8000 : 0)
+        | ((static_cast<int>(tmp.z) & 0x1F) << 10)
+        | ((static_cast<int>(tmp.y) & 0x1F) << 5)
+        | (static_cast<int>(tmp.x) & 0x1F));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Scale = { { { 1.0f, 32.f / 2.f, 32.f * 32.f, 32.f * 32.f * 32.f / 2.f } } };
+    static const XMVECTORU32 Mask = { { { 0x1F, 0x1F << (5 - 1), 0x1F << 10, 0x1 << (15 - 1) } } };
+    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
+    vResult = vminq_f32(vResult, Max);
+    vResult = vmulq_f32(vResult, Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti, Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32(vTemp, 1);
+    vTemp2 = vadd_u32(vTemp2, vTemp2);
+    vTemp = vorr_u32(vTemp, vTemp2);
+    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
+    vResult = _mm_min_ps(vResult, Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
+    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
+    auto z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
+    auto w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
+
+
+/****************************************************************************
+ *
+ * XMCOLOR operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMCOLOR::XMCOLOR
+(
+    float _r,
+    float _g,
+    float _b,
+    float _a
+) noexcept
+{
+    XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMCOLOR::XMCOLOR(const float* pArray) noexcept
+{
+    XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMHALF2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMHALF2::XMHALF2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    x = XMConvertFloatToHalf(_x);
+    y = XMConvertFloatToHalf(_y);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMHALF2::XMHALF2(const float* pArray) noexcept
+{
+    assert(pArray != nullptr);
+    x = XMConvertFloatToHalf(pArray[0]);
+    y = XMConvertFloatToHalf(pArray[1]);
+}
+
+/****************************************************************************
+ *
+ * XMSHORTN2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMSHORTN2::XMSHORTN2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMSHORTN2::XMSHORTN2(const float* pArray) noexcept
+{
+    XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMSHORT2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMSHORT2::XMSHORT2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMSHORT2::XMSHORT2(const float* pArray) noexcept
+{
+    XMStoreShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUSHORTN2::XMUSHORTN2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUSHORTN2::XMUSHORTN2(const float* pArray) noexcept
+{
+    XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
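All of the 2-component wrappers above funnel through the corresponding XMStore* routine, and DirectXPackedVector provides matching XMLoad* functions for the reverse direction. A small round-trip sketch (illustrative only):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    void RoundTripExample()
    {
        // 0.25f and 0.75f quantized to unsigned normalized 16-bit lanes
        XMUSHORTN2 packed(0.25f, 0.75f);

        // Back to floats: x/y come out as ~0.25 and ~0.75 again,
        // within the 1/65535 quantization step.
        XMVECTOR v = XMLoadUShortN2(&packed);
        XMFLOAT2 unpacked;
        XMStoreFloat2(&unpacked, v);
    }
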
+/****************************************************************************
+ *
+ * XMUSHORT2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUSHORT2::XMUSHORT2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUSHORT2::XMUSHORT2(const float* pArray) noexcept
+{
+    XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMBYTEN2::XMBYTEN2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMBYTEN2::XMBYTEN2(const float* pArray) noexcept
+{
+    XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTE2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMBYTE2::XMBYTE2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMBYTE2::XMBYTE2(const float* pArray) noexcept
+{
+    XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUBYTEN2::XMUBYTEN2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUBYTEN2::XMUBYTEN2(const float* pArray) noexcept
+{
+    XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE2 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUBYTE2::XMUBYTE2
+(
+    float _x,
+    float _y
+) noexcept
+{
+    XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUBYTE2::XMUBYTE2(const float* pArray) noexcept
+{
+    XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMU565 operators
+ *
+ ****************************************************************************/
+
+inline XMU565::XMU565
+(
+    float _x,
+    float _y,
+    float _z
+) noexcept
+{
+    XMStoreU565(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+_Use_decl_annotations_
+inline XMU565::XMU565(const float* pArray) noexcept
+{
+    XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3PK operators
+ *
+ ****************************************************************************/
+
+inline XMFLOAT3PK::XMFLOAT3PK
+(
+    float _x,
+    float _y,
+    float _z
+) noexcept
+{
+    XMStoreFloat3PK(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+_Use_decl_annotations_
+inline XMFLOAT3PK::XMFLOAT3PK(const float* pArray) noexcept
+{
+    XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3SE operators
+ *
+ ****************************************************************************/
+
+inline XMFLOAT3SE::XMFLOAT3SE
+(
+    float _x,
+    float _y,
+    float _z
+) noexcept
+{
+    XMStoreFloat3SE(this, XMVectorSet(_x, _y, _z, 0.0f));
+}
+
+_Use_decl_annotations_
+inline XMFLOAT3SE::XMFLOAT3SE(const float* pArray) noexcept
+{
+    XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
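XMFLOAT3PK above is the packed R11G11B10_FLOAT format (two 11-bit and one 10-bit unsigned small float, no sign bits), and XMFLOAT3SE the 9-9-9-5 shared-exponent format; both round trips are lossy and negative inputs clamp to zero. A usage sketch for the PK variant (illustrative only):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    void Float3PKExample()
    {
        XMFLOAT3PK hdr(1.5f, 0.25f, 8.0f);   // packs to 11:11:10 small floats

        XMVECTOR v = XMLoadFloat3PK(&hdr);   // expands back to full floats
        XMFLOAT3 rgb;
        XMStoreFloat3(&rgb, v);              // values match only to a few significant digits
    }
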
+/****************************************************************************
+ *
+ * XMHALF4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMHALF4::XMHALF4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    x = XMConvertFloatToHalf(_x);
+    y = XMConvertFloatToHalf(_y);
+    z = XMConvertFloatToHalf(_z);
+    w = XMConvertFloatToHalf(_w);
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMHALF4::XMHALF4(const float* pArray) noexcept
+{
+    XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4);
+}
+
+/****************************************************************************
+ *
+ * XMSHORTN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMSHORTN4::XMSHORTN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMSHORTN4::XMSHORTN4(const float* pArray) noexcept
+{
+    XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMSHORT4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMSHORT4::XMSHORT4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMSHORT4::XMSHORT4(const float* pArray) noexcept
+{
+    XMStoreShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUSHORTN4::XMUSHORTN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUSHORTN4::XMUSHORTN4(const float* pArray) noexcept
+{
+    XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORT4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUSHORT4::XMUSHORT4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUSHORT4::XMUSHORT4(const float* pArray) noexcept
+{
+    XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMXDECN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMXDECN4::XMXDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMXDECN4::XMXDECN4(const float* pArray) noexcept
+{
+    XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMXDEC4 operators
+ *
+ ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+ //------------------------------------------------------------------------------
+
+inline XMXDEC4::XMXDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMXDEC4::XMXDEC4(const float* pArray) noexcept
+{
+    XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMDECN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMDECN4::XMDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMDECN4::XMDECN4(const float* pArray) noexcept
+{
+    XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
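The XMXDEC4/XMDECN4/XMDEC4 block above shows the cross-compiler pattern for using deprecated declarations without warning noise: MSVC's #pragma warning(push/disable : 4996/pop) paired with GCC/Clang's diagnostic push/ignored/pop. The same skeleton in isolation, as an illustrative sketch:

    // Portable "use a deprecated API quietly" wrapper; Clang also honors
    // the GCC diagnostic pragmas.
    #ifdef _MSC_VER
    #pragma warning(push)
    #pragma warning(disable : 4996)   // C4996: deprecated declaration
    #endif
    #ifdef __GNUC__
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    #endif

    // ... code that touches the deprecated declarations goes here ...

    #ifdef __GNUC__
    #pragma GCC diagnostic pop
    #endif
    #ifdef _MSC_VER
    #pragma warning(pop)
    #endif
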
+/****************************************************************************
+ *
+ * XMDEC4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMDEC4::XMDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMDEC4::XMDEC4(const float* pArray) noexcept
+{
+    XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * XMUDECN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUDECN4::XMUDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUDECN4::XMUDECN4(const float* pArray) noexcept
+{
+    XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUDEC4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUDEC4::XMUDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUDEC4::XMUDEC4(const float* pArray) noexcept
+{
+    XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMBYTEN4::XMBYTEN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMBYTEN4::XMBYTEN4(const float* pArray) noexcept
+{
+    XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTE4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMBYTE4::XMBYTE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMBYTE4::XMBYTE4(const float* pArray) noexcept
+{
+    XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUBYTEN4::XMUBYTEN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUBYTEN4::XMUBYTEN4(const float* pArray) noexcept
+{
+    XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUBYTE4::XMUBYTE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUBYTE4::XMUBYTE4(const float* pArray) noexcept
+{
+    XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUNIBBLE4 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMUNIBBLE4::XMUNIBBLE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+) noexcept
+{
+    XMStoreUNibble4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMUNIBBLE4::XMUNIBBLE4(const float* pArray) noexcept
+{
+    XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMU555 operators
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMU555::XMU555
+(
+    float _x,
+    float _y,
+    float _z,
+    bool _w
+) noexcept
+{
+    XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f)));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMU555::XMU555
+(
+    const float* pArray,
+    bool _w
+) noexcept
+{
+    XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray));
+    XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f)));
+}
+
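XMU555 is the only wrapper here that takes a bool for its fourth component, mirroring the single alpha bit of the 1555 layout. A usage sketch (illustrative only):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    void U555Example()
    {
        // x/y/z are quantized to 5 bits each (0..31 after rounding);
        // the bool maps to the single bit 15.
        XMU555 texel(12.0f, 31.0f, 7.0f, true);

        XMVECTOR v = XMLoadU555(&texel);   // yields (12, 31, 7, 1) as floats
    }
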
diff --git a/DirectXTex/BC.cpp b/DirectXTex/BC.cpp
index ef4583c..2ead1ed 100644
--- a/DirectXTex/BC.cpp
+++ b/DirectXTex/BC.cpp
@@ -412,7 +412,7 @@ namespace
         HDRColorA Error[NUM_PIXELS_PER_BLOCK];
 
         if (flags & BC_FLAGS_DITHER_RGB)
-            memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA));
+            std::memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA));
 
         size_t i;
         for (i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
@@ -598,7 +598,7 @@ namespace
         // Encode colors
         uint32_t dw = 0;
 
         if (flags & BC_FLAGS_DITHER_RGB)
-            memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA));
+            std::memset(Error, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(HDRColorA));
 
         for (i = 0; i < NUM_PIXELS_PER_BLOCK; ++i)
         {
@@ -1010,7 +1010,7 @@ void DirectX::D3DXEncodeBC3(uint8_t *pBC, const XMVECTOR *pColor, uint32_t flags
             EncodeSolidBC1(&pBC3->dxt1, Color);
             pBC3->alpha[0] = 0x00;
             pBC3->alpha[1] = 0x00;
-            memset(pBC3->bitmap, 0x00, 6);
+            std::memset(pBC3->bitmap, 0x00, 6);
         }
 #endif
@@ -1022,7 +1022,7 @@ void DirectX::D3DXEncodeBC3(uint8_t *pBC, const XMVECTOR *pColor, uint32_t flags
     {
         pBC3->alpha[0] = 0xff;
        pBC3->alpha[1] = 0xff;
-        memset(pBC3->bitmap, 0x00, 6);
+        std::memset(pBC3->bitmap, 0x00, 6);
         return;
     }
@@ -1043,7 +1043,7 @@ void DirectX::D3DXEncodeBC3(uint8_t *pBC, const XMVECTOR *pColor, uint32_t flags
     {
         pBC3->alpha[0] = bAlphaA;
         pBC3->alpha[1] = bAlphaB;
-        memset(pBC3->bitmap, 0x00, 6);
+        std::memset(pBC3->bitmap, 0x00, 6);
         return;
     }
@@ -1088,7 +1088,7 @@ void DirectX::D3DXEncodeBC3(uint8_t *pBC, const XMVECTOR *pColor, uint32_t flags
     float fScale = (fStep[0] != fStep[1]) ? (fSteps / (fStep[1] - fStep[0])) : 0.0f;
 
     if (flags & BC_FLAGS_DITHER_A)
-        memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float));
+        std::memset(fError, 0x00, NUM_PIXELS_PER_BLOCK * sizeof(float));
 
     for (size_t iSet = 0; iSet < 2; iSet++)
     {
diff --git a/DirectXTex/DirectXTex.h b/DirectXTex/DirectXTex.h
index ad683f7..f89c268 100644
--- a/DirectXTex/DirectXTex.h
+++ b/DirectXTex/DirectXTex.h
@@ -8,14 +8,18 @@
 //
 // http://go.microsoft.com/fwlink/?LinkId=248926
 //-------------------------------------------------------------------------------------
-
 #pragma once
-#include
-#include
-#include
-#include
+ // NOTE: We have already included these in the precompiled header.
+ // Including these here breaks ports when sal.h conflicts with the libcxx STL and CRT.
+#if !defined(_DXTEX_PRIVATE)
+    #include
+    #include
+    #include
+    #include
+#endif
 
+#if !defined(_DXTX_NOWIN)
 #if !defined(__d3d11_h__) && !defined(__d3d11_x_h__) && !defined(__d3d12_h__) && !defined(__d3d12_x_h__) && !defined(__XBOX_D3D12_X__)
 #ifdef _GAMING_XBOX_SCARLETT
 #include <d3d12_xs.h>
@@ -27,17 +31,19 @@
 #include <d3d11_1.h>
 #endif
 #endif
+#endif
 
 #include <DirectXMath.h>
 
+#if !defined(_DXTX_NOWIN)
 #include <ocidl.h>
+#endif
 
 #define DIRECTX_TEX_VERSION 190
 
 struct IWICImagingFactory;
 struct IWICMetadataQueryReader;
-
 namespace DirectX
 {
@@ -410,6 +416,8 @@
     HRESULT __cdecl SaveToTGAFile(_In_ const Image& image,
         _In_z_ const wchar_t* szFile, _In_opt_ const TexMetadata* metadata = nullptr) noexcept;
 
     // WIC operations
+
+    #if !defined(_DXTX_NOWIN)
     HRESULT __cdecl LoadFromWICMemory(
         _In_reads_bytes_(size) const void* pSource, _In_ size_t size,
         _In_ WIC_FLAGS flags,
@@ -440,6 +448,7 @@
         _In_z_ const wchar_t* szFile,
         _In_opt_ const GUID* targetFormat = nullptr,
         _In_opt_ std::function<void __cdecl(IPropertyBag2*)> setCustomProps = nullptr);
+    #endif
 
     //---------------------------------------------------------------------------------
     // Texture conversion, resizing, mipmap generation, and block compression
@@ -749,7 +758,9 @@
         WIC_CODEC_ICO, // Windows Icon (.ico)
     };
 
+#if !defined(_DXTX_NOWIN)
     REFGUID __cdecl GetWICCodec(_In_ WICCodecs codec) noexcept;
+#endif
 
     IWICImagingFactory* __cdecl GetWICFactory(bool& iswic2) noexcept;
     void __cdecl SetWICFactory(_In_opt_ IWICImagingFactory* pWIC) noexcept;
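The _DXTEX_PRIVATE guard above relies on the library's own translation units defining that macro before pulling in the public header; the DirectXTexP.h hunk near the end of this patch does exactly that (#define _DXTEX_PRIVATE, then #include "DirectXTex.h"). A minimal sketch of the pattern, where "pch.h" is a hypothetical stand-in for whatever precompiled header already provides the standard includes:

    // Illustrative only, not part of the patch.
    #include "pch.h"            // already provides <cstddef>, <cstdint>, etc.

    #define _DXTEX_PRIVATE      // suppress the public header's own std includes
    #include "DirectXTex.h"     // compiles against the pch's headers instead
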
diff --git a/DirectXTex/DirectXTexConvert.cpp b/DirectXTex/DirectXTexConvert.cpp
index 16c2fbe..b95e29d 100644
--- a/DirectXTex/DirectXTexConvert.cpp
+++ b/DirectXTex/DirectXTexConvert.cpp
@@ -13,7 +13,10 @@
 
 using namespace DirectX;
 using namespace DirectX::PackedVector;
+
+#if !defined(_DXTX_NOWIN)
 using Microsoft::WRL::ComPtr;
+#endif
 
 namespace
 {
@@ -2640,7 +2643,7 @@ HRESULT DirectX::_ConvertToR16G16B16A16(const Image& srcImage, ScratchImage& ima
     if (FAILED(hr))
         return hr;
 
-    ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(_aligned_malloc((sizeof(XMVECTOR) * srcImage.width), 16)));
+    ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(AllocateVectorAligned((sizeof(XMVECTOR) * srcImage.width))));
     if (!scanline)
     {
         image.Release();
@@ -2693,7 +2696,7 @@ HRESULT DirectX::_ConvertFromR16G16B16A16(const Image& srcImage, const Image& de
     if (srcImage.width != destImage.width || srcImage.height != destImage.height)
         return E_FAIL;
 
-    ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(_aligned_malloc((sizeof(XMVECTOR) * srcImage.width), 16)));
+    ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(AllocateVectorAligned((sizeof(XMVECTOR) * srcImage.width))));
     if (!scanline)
         return E_OUTOFMEMORY;
@@ -4363,6 +4366,8 @@ bool DirectX::_StoreScanlineDither(
 
 namespace
 {
+
+    #if !defined(_DXTX_NOWIN)
     //-------------------------------------------------------------------------------------
     // Selection logic for using WIC vs. our own routines
     //-------------------------------------------------------------------------------------
@@ -4573,6 +4578,8 @@
         return S_OK;
     }
 
+    #endif
+
 
     //-------------------------------------------------------------------------------------
     // Convert the source image (not using WIC)
@@ -4597,7 +4604,7 @@
         if (filter & TEX_FILTER_DITHER_DIFFUSION)
         {
             // Error diffusion dithering (aka Floyd-Steinberg dithering)
-            ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(_aligned_malloc((sizeof(XMVECTOR)*(width * 2 + 2)), 16)));
+            ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(AllocateVectorAligned((sizeof(XMVECTOR)*(width * 2 + 2)))));
             if (!scanline)
                 return E_OUTOFMEMORY;
@@ -4620,7 +4627,7 @@
         }
         else
         {
-            ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(_aligned_malloc((sizeof(XMVECTOR)*width), 16)));
+            ScopedAlignedArrayXMVECTOR scanline(static_cast<XMVECTOR*>(AllocateVectorAligned((sizeof(XMVECTOR)*width))));
             if (!scanline)
                 return E_OUTOFMEMORY;
@@ -4856,12 +4863,14 @@ HRESULT DirectX::Convert(
         return E_POINTER;
     }
 
+#if !defined(_DXTX_NOWIN)
     WICPixelFormatGUID pfGUID, targetGUID;
     if (UseWICConversion(filter, srcImage.format, format, pfGUID, targetGUID))
     {
         hr = ConvertUsingWIC(srcImage, pfGUID, targetGUID, filter, threshold, *rimage);
     }
     else
+#endif
     {
         hr = ConvertCustom(srcImage, filter, *rimage, threshold, 0);
     }
@@ -4920,8 +4929,14 @@ HRESULT DirectX::Convert(
         return E_POINTER;
     }
 
-    WICPixelFormatGUID pfGUID, targetGUID;
-    bool usewic = !metadata.IsPMAlpha() && UseWICConversion(filter, metadata.format, format, pfGUID, targetGUID);
+    bool usewic;
+#if !defined(_DXTX_NOWIN)
+    WICPixelFormatGUID pfGUID, targetGUID;
+    usewic = !metadata.IsPMAlpha() && UseWICConversion(filter, metadata.format, format, pfGUID, targetGUID);
+#else
+    usewic = false;
+#endif
+
 
     switch (metadata.dimension)
     {
@@ -4951,11 +4966,13 @@
                 return E_FAIL;
             }
 
+#if !defined(_DXTX_NOWIN)
             if (usewic)
             {
                 hr = ConvertUsingWIC(src, pfGUID, targetGUID, filter, threshold, dst);
             }
             else
+#endif
             {
                 hr = ConvertCustom(src, filter, dst, threshold, 0);
             }
@@ -5004,11 +5021,13 @@
                 return E_FAIL;
             }
 
+#if !defined(_DXTX_NOWIN)
             if (usewic)
             {
                 hr = ConvertUsingWIC(src, pfGUID, targetGUID, filter, threshold, dst);
             }
             else
+#endif
             {
                 hr = ConvertCustom(src, filter, dst, threshold, slice);
             }
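AllocateVectorAligned (defined in the scoped.h hunk near the end of this patch) maps to std::aligned_alloc, which formally requires the requested size to be a multiple of the alignment. The sizeof(XMVECTOR) multiples passed above all satisfy that, but a defensive variant would round up; a sketch assuming C++17 on a non-MSVC toolchain (MSVC does not provide std::aligned_alloc):

    #include <cstddef>
    #include <cstdlib>

    // Illustrative rounding variant of AllocateVectorAligned, not part of
    // the patch: std::aligned_alloc requires size to be a multiple of the
    // alignment, so round length up to the next multiple of 16.
    static inline void* AllocateVectorAlignedChecked(std::size_t length)
    {
        const std::size_t rounded = (length + 15u) & ~static_cast<std::size_t>(15u);
        return std::aligned_alloc(16, rounded);
    }
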
diff --git a/DirectXTex/DirectXTexDDS.cpp b/DirectXTex/DirectXTexDDS.cpp
index 9a5d323..b565a56 100644
--- a/DirectXTex/DirectXTexDDS.cpp
+++ b/DirectXTex/DirectXTexDDS.cpp
@@ -1575,32 +1575,10 @@ HRESULT DirectX::GetMetadataFromDDSFile(
     if (!szFile)
         return E_INVALIDARG;
 
-#if (_WIN32_WINNT >= _WIN32_WINNT_WIN8)
-    ScopedHandle hFile(safe_handle(CreateFile2(szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, nullptr)));
-#else
-    ScopedHandle hFile(safe_handle(CreateFileW(szFile, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
-        FILE_FLAG_SEQUENTIAL_SCAN, nullptr)));
-#endif
-    if (!hFile)
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    // Get the file size
-    FILE_STANDARD_INFO fileInfo;
-    if (!GetFileInformationByHandleEx(hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo)))
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid DDS file)
-    if (fileInfo.EndOfFile.HighPart > 0)
-    {
-        return HRESULT_FROM_WIN32(ERROR_FILE_TOO_LARGE);
-    }
+    ScopedReadFile hFile(szFile);
 
     // Need at least enough data to fill the standard header and magic number to be a valid DDS
-    if (fileInfo.EndOfFile.LowPart < (sizeof(DDS_HEADER) + sizeof(uint32_t)))
+    if (hFile.GetLength() < (sizeof(DDS_HEADER) + sizeof(uint32_t)))
     {
         return E_FAIL;
     }
@@ -1610,9 +1588,9 @@
     uint8_t header[MAX_HEADER_SIZE] = {};
 
     DWORD bytesRead = 0;
-    if (!ReadFile(hFile.get(), header, MAX_HEADER_SIZE, &bytesRead, nullptr))
+    if (!(bytesRead = hFile.Read(header, MAX_HEADER_SIZE)))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     uint32_t convFlags = 0;
@@ -1708,33 +1686,11 @@ HRESULT DirectX::LoadFromDDSFile(
 
     image.Release();
 
-#if (_WIN32_WINNT >= _WIN32_WINNT_WIN8)
-    ScopedHandle hFile(safe_handle(CreateFile2(szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, nullptr)));
-#else
-    ScopedHandle hFile(safe_handle(CreateFileW(szFile, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
-        FILE_FLAG_SEQUENTIAL_SCAN, nullptr)));
-#endif
-    if (!hFile)
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    // Get the file size
-    FILE_STANDARD_INFO fileInfo;
-    if (!GetFileInformationByHandleEx(hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo)))
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid DDS file)
-    if (fileInfo.EndOfFile.HighPart > 0)
-    {
-        return HRESULT_FROM_WIN32(ERROR_FILE_TOO_LARGE);
-    }
+    ScopedReadFile hFile(szFile);
 
     // Need at least enough data to fill the standard header and magic number to be a valid DDS
-    if (fileInfo.EndOfFile.LowPart < (sizeof(DDS_HEADER) + sizeof(uint32_t)))
+    if (hFile.GetLength() < (sizeof(DDS_HEADER) + sizeof(uint32_t)))
     {
         return E_FAIL;
     }
@@ -1744,9 +1700,9 @@
     uint8_t header[MAX_HEADER_SIZE] = {};
 
     DWORD bytesRead = 0;
-    if (!ReadFile(hFile.get(), header, MAX_HEADER_SIZE, &bytesRead, nullptr))
+    if (!(bytesRead = hFile.Read(header, MAX_HEADER_SIZE)))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     uint32_t convFlags = 0;
@@ -1760,12 +1716,7 @@
     if (!(convFlags & CONV_FLAGS_DX10))
     {
         // Must reset file position since we read more than the standard header above
-        LARGE_INTEGER filePos = { { sizeof(uint32_t) + sizeof(DDS_HEADER), 0 } };
-        if (!SetFilePointerEx(hFile.get(), filePos, nullptr, FILE_BEGIN))
-        {
-            return HRESULT_FROM_WIN32(GetLastError());
-        }
-
+        hFile.SeekBegin(sizeof(uint32_t) + sizeof(DDS_HEADER));
         offset = sizeof(uint32_t) + sizeof(DDS_HEADER);
     }
@@ -1778,9 +1729,9 @@
         return E_OUTOFMEMORY;
     }
 
-    if (!ReadFile(hFile.get(), pal8.get(), 256 * sizeof(uint32_t), &bytesRead, nullptr))
+    if (!(bytesRead = hFile.Read(pal8.get(), 256 * sizeof(uint32_t))))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     if (bytesRead != (256 * sizeof(uint32_t)))
@@ -1791,7 +1742,7 @@
         offset += (256 * sizeof(uint32_t));
     }
 
-    DWORD remaining = fileInfo.EndOfFile.LowPart - offset;
+    DWORD remaining = hFile.GetLength() - offset;
     if (remaining == 0)
         return E_FAIL;
@@ -1808,10 +1759,10 @@
             return E_OUTOFMEMORY;
         }
 
-        if (!ReadFile(hFile.get(), temp.get(), remaining, &bytesRead, nullptr))
+        if (!(bytesRead = hFile.Read(temp.get(), remaining)))
         {
             image.Release();
-            return HRESULT_FROM_WIN32(GetLastError());
+            return E_FAIL;
         }
 
         if (bytesRead != remaining)
@@ -1857,10 +1808,10 @@
             return HRESULT_FROM_WIN32(ERROR_ARITHMETIC_OVERFLOW);
         }
 
-        if (!ReadFile(hFile.get(), image.GetPixels(), static_cast<DWORD>(image.GetPixelsSize()), &bytesRead, nullptr))
+        if (!(bytesRead = hFile.Read(image.GetPixels(), static_cast<DWORD>(image.GetPixelsSize()))))
        {
             image.Release();
-            return HRESULT_FROM_WIN32(GetLastError());
+            return E_FAIL;
         }
 
     if (convFlags & (CONV_FLAGS_SWIZZLE | CONV_FLAGS_NOALPHA))
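ScopedReadFile and ScopedWriteFile are not shown in this patch; they presumably come from the new StdFS.hpp pulled in by DirectXTexP.h. A minimal sketch of the interface these hunks rely on, written against standard C++17 streams — the names and semantics are assumptions, not the real implementation:

    #include <cstddef>
    #include <filesystem>
    #include <fstream>

    // Hypothetical reconstruction of the reader used above.
    class ScopedReadFile
    {
    public:
        explicit ScopedReadFile(const std::filesystem::path& path)
            : length_(std::filesystem::exists(path) ? static_cast<std::size_t>(std::filesystem::file_size(path)) : 0),
              stream_(path, std::ios::binary)
        {
        }

        // Returns bytes read, 0 on failure (matching the !(bytesRead = ...) checks)
        std::size_t Read(void* dst, std::size_t bytes)
        {
            stream_.read(static_cast<char*>(dst), static_cast<std::streamsize>(bytes));
            return static_cast<std::size_t>(stream_.gcount());
        }

        void SeekBegin(std::size_t offset)
        {
            stream_.clear();
            stream_.seekg(static_cast<std::streamoff>(offset), std::ios::beg);
        }

        std::size_t GetLength() const { return length_; }

    private:
        std::size_t   length_;
        std::ifstream stream_;
    };
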
@@ -2133,25 +2084,9 @@ HRESULT DirectX::SaveToDDSFile(
         return hr;
 
     // Create file and write header
-#if (_WIN32_WINNT >= _WIN32_WINNT_WIN8)
-    ScopedHandle hFile(safe_handle(CreateFile2(szFile, GENERIC_WRITE | DELETE, 0, CREATE_ALWAYS, nullptr)));
-#else
-    ScopedHandle hFile(safe_handle(CreateFileW(szFile, GENERIC_WRITE | DELETE, 0, nullptr, CREATE_ALWAYS, 0, nullptr)));
-#endif
-    if (!hFile)
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
+    ScopedWriteFile hFile(szFile);
 
-    auto_delete_file delonfail(hFile.get());
-
-    DWORD bytesWritten;
-    if (!WriteFile(hFile.get(), header, static_cast<DWORD>(required), &bytesWritten, nullptr))
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    if (bytesWritten != required)
+    if (!hFile.Write(header, static_cast<DWORD>(required)))
     {
         return E_FAIL;
     }
@@ -2183,15 +2118,12 @@
             if ((images[index].slicePitch == ddsSlicePitch) && (ddsSlicePitch <= UINT32_MAX))
             {
-                if (!WriteFile(hFile.get(), images[index].pixels, static_cast<DWORD>(ddsSlicePitch), &bytesWritten, nullptr))
-                {
-                    return HRESULT_FROM_WIN32(GetLastError());
-                }
-
-                if (bytesWritten != ddsSlicePitch)
+                if (!hFile.Write(images[index].pixels, static_cast<DWORD>(ddsSlicePitch)))
                 {
                     return E_FAIL;
                 }
             }
             else
             {
@@ -2210,12 +2142,7 @@
                 size_t lines = ComputeScanlines(metadata.format, images[index].height);
                 for (size_t j = 0; j < lines; ++j)
                 {
-                    if (!WriteFile(hFile.get(), sPtr, static_cast<DWORD>(ddsRowPitch), &bytesWritten, nullptr))
-                    {
-                        return HRESULT_FROM_WIN32(GetLastError());
-                    }
-
-                    if (bytesWritten != ddsRowPitch)
+                    if (!hFile.Write(sPtr, static_cast<DWORD>(ddsRowPitch)))
                     {
                         return E_FAIL;
                     }
@@ -2256,12 +2183,7 @@
                     if ((images[index].slicePitch == ddsSlicePitch) && (ddsSlicePitch <= UINT32_MAX))
                     {
-                        if (!WriteFile(hFile.get(), images[index].pixels, static_cast<DWORD>(ddsSlicePitch), &bytesWritten, nullptr))
-                        {
-                            return HRESULT_FROM_WIN32(GetLastError());
-                        }
-
-                        if (bytesWritten != ddsSlicePitch)
+                        if (!hFile.Write(images[index].pixels, static_cast<DWORD>(ddsSlicePitch)))
                         {
                             return E_FAIL;
                         }
@@ -2283,16 +2205,10 @@
                         size_t lines = ComputeScanlines(metadata.format, images[index].height);
                         for (size_t j = 0; j < lines; ++j)
                         {
-                            if (!WriteFile(hFile.get(), sPtr, static_cast<DWORD>(ddsRowPitch), &bytesWritten, nullptr))
-                            {
-                                return HRESULT_FROM_WIN32(GetLastError());
-                            }
-
-                            if (bytesWritten != ddsRowPitch)
+                            if (!hFile.Write(sPtr, static_cast<DWORD>(ddsRowPitch)))
                             {
                                 return E_FAIL;
                             }
-
                             sPtr += rowPitch;
                         }
                     }
@@ -2308,7 +2224,7 @@
         return E_FAIL;
     }
 
-    delonfail.clear();
+    //delonfail.clear();
 
     return S_OK;
 }
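Note that the deleted auto_delete_file helper used to remove partially written DDS files when a save failed; the port leaves only a commented-out call, so that safety net is gone. A portable equivalent is straightforward with std::filesystem — a sketch, not part of the patch:

    #include <filesystem>
    #include <system_error>
    #include <utility>

    // Illustrative replacement for the removed auto_delete_file: deletes the
    // named file on destruction unless clear() is called after a successful save.
    class auto_delete_file_path
    {
    public:
        explicit auto_delete_file_path(std::filesystem::path p) : path_(std::move(p)) {}
        auto_delete_file_path(const auto_delete_file_path&) = delete;
        auto_delete_file_path& operator=(const auto_delete_file_path&) = delete;

        ~auto_delete_file_path()
        {
            if (!path_.empty())
            {
                std::error_code ec;                 // swallow failures in a destructor
                std::filesystem::remove(path_, ec);
            }
        }

        void clear() noexcept { path_.clear(); }

    private:
        std::filesystem::path path_;
    };
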
diff --git a/DirectXTex/DirectXTexHDR.cpp b/DirectXTex/DirectXTexHDR.cpp
index f00d19a..efda425 100644
--- a/DirectXTex/DirectXTexHDR.cpp
+++ b/DirectXTex/DirectXTexHDR.cpp
@@ -580,32 +580,10 @@ HRESULT DirectX::GetMetadataFromHDRFile(const wchar_t* szFile, TexMetadata& meta
     if (!szFile)
         return E_INVALIDARG;
 
-#if (_WIN32_WINNT >= _WIN32_WINNT_WIN8)
-    ScopedHandle hFile(safe_handle(CreateFile2(szFile, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, nullptr)));
-#else
-    ScopedHandle hFile(safe_handle(CreateFileW(szFile, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
-        FILE_FLAG_SEQUENTIAL_SCAN, nullptr)));
-#endif
-    if (!hFile)
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    // Get the file size
-    FILE_STANDARD_INFO fileInfo;
-    if (!GetFileInformationByHandleEx(hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo)))
-    {
-        return HRESULT_FROM_WIN32(GetLastError());
-    }
-
-    // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid HDR file)
-    if (fileInfo.EndOfFile.HighPart > 0)
-    {
-        return HRESULT_FROM_WIN32(ERROR_FILE_TOO_LARGE);
-    }
+    ScopedReadFile hFile(szFile);
 
     // Need at least enough data to fill the standard header to be a valid HDR
-    if (fileInfo.EndOfFile.LowPart < sizeof(g_Signature))
+    if (hFile.GetLength() < sizeof(g_Signature))
     {
         return E_FAIL;
     }
@@ -615,7 +593,7 @@
     DWORD bytesRead = 0;
-    if (!ReadFile(hFile.get(), header, std::min(sizeof(header), fileInfo.EndOfFile.LowPart), &bytesRead, nullptr))
+    if (!(bytesRead = hFile.Read(header, std::min<size_t>(sizeof(header), hFile.GetLength()))))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     size_t offset;
@@ -860,14 +838,14 @@ HRESULT DirectX::LoadFromHDRFile(const wchar_t* szFile, TexMetadata* metadata, S
 #endif
     if (!hFile)
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     // Get the file size
     FILE_STANDARD_INFO fileInfo;
     if (!GetFileInformationByHandleEx(hFile.get(), FileStandardInfo, &fileInfo, sizeof(fileInfo)))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     // File is too big for 32-bit allocation, so reject read (4 GB should be plenty large enough for a valid HDR file)
@@ -892,7 +870,7 @@
     DWORD bytesRead = 0;
     if (!ReadFile(hFile.get(), temp.get(), fileInfo.EndOfFile.LowPart, &bytesRead, nullptr))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     if (bytesRead != fileInfo.EndOfFile.LowPart)
@@ -1055,7 +1033,7 @@ HRESULT DirectX::SaveToHDRFile(const Image& image, const wchar_t* szFile) noexce
 #endif
     if (!hFile)
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     auto_delete_file delonfail(hFile.get());
@@ -1082,7 +1060,7 @@
     DWORD bytesWritten;
     if (!WriteFile(hFile.get(), blob.GetBufferPointer(), bytesToWrite, &bytesWritten, nullptr))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     if (bytesWritten != bytesToWrite)
@@ -1108,7 +1086,7 @@
     DWORD bytesWritten;
     if (!WriteFile(hFile.get(), header, headerLen, &bytesWritten, nullptr))
     {
-        return HRESULT_FROM_WIN32(GetLastError());
+        return E_FAIL;
     }
 
     if (bytesWritten != headerLen)
@@ -1124,7 +1102,7 @@
 
         if (!WriteFile(hFile.get(), rgbe, static_cast<DWORD>(rowPitch), &bytesWritten, nullptr))
         {
-            return HRESULT_FROM_WIN32(GetLastError());
+            return E_FAIL;
         }
 
         if (bytesWritten != rowPitch)
@@ -1154,7 +1132,7 @@
 
             if (!WriteFile(hFile.get(), enc, static_cast<DWORD>(encSize), &bytesWritten, nullptr))
            {
-                return HRESULT_FROM_WIN32(GetLastError());
+                return E_FAIL;
            }
 
             if (bytesWritten != encSize)
@@ -1164,7 +1142,7 @@
         {
             if (!WriteFile(hFile.get(), rgbe, static_cast<DWORD>(rowPitch), &bytesWritten, nullptr))
             {
-                return HRESULT_FROM_WIN32(GetLastError());
+                return E_FAIL;
            }
 
             if (bytesWritten != rowPitch)
diff --git a/DirectXTex/DirectXTexP.h b/DirectXTex/DirectXTexP.h
index bd3a193..65db76e 100644
--- a/DirectXTex/DirectXTexP.h
+++ b/DirectXTex/DirectXTexP.h
@@ -74,6 +74,28 @@
 #define WIN32_LEAN_AND_MEAN
 #endif
 
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#if !defined(_DXTX_NOWIN)
+
 #pragma warning(push)
 #pragma warning(disable : 4005)
 #define NOMINMAX
@@ -107,29 +129,32 @@
 
 #define _XM_NO_XMVECTOR_OVERLOADS_
 
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-
 #include
-
-#include "DirectXTex.h"
-
 #include
 #include
+#else // !defined(_DXTX_NOWIN)
+
+#if defined(__clang__)
+#include "ClangCommon.hpp"
+#endif
+#include "Win32Public.hpp"
+#include "MSVCStubs.hpp"
+
+#endif
+
+#include "StdFS.hpp"
+
+#define _DXTEX_PRIVATE
+#include "DirectXTex.h"
+
+#include
+#include
+
+#include "scoped.h"
+
 #define XBOX_DXGI_FORMAT_R10G10B10_7E3_A2_FLOAT DXGI_FORMAT(116)
 #define XBOX_DXGI_FORMAT_R10G10B10_6E4_A2_FLOAT DXGI_FORMAT(117)
 #define XBOX_DXGI_FORMAT_D16_UNORM_S8_UINT DXGI_FORMAT(118)
@@ -155,6 +180,7 @@
 namespace DirectX
 {
     TEX_FILTER_FLAGS __cdecl _CheckWICColorSpace(_In_ const GUID& sourceGUID, _In_ const GUID& targetGUID) noexcept;
 
+#if !defined(_DXTX_NOWIN)
     inline WICBitmapDitherType __cdecl _GetWICDither(_In_ TEX_FILTER_FLAGS flags) noexcept
     {
         static_assert(TEX_FILTER_DITHER == 0x10000, "TEX_FILTER_DITHER* flag values don't match mask");
@@ -235,6 +261,7 @@
         }
     }
 
+#endif
 
     //---------------------------------------------------------------------------------
     // Image helper functions
diff --git a/DirectXTex/scoped.h b/DirectXTex/scoped.h
index 9d8832d..6a53e90 100644
--- a/DirectXTex/scoped.h
+++ b/DirectXTex/scoped.h
@@ -9,50 +9,16 @@
 
 #pragma once
 
-#include <assert.h>
-#include <malloc.h>
-#include <memory>
 
 //---------------------------------------------------------------------------------
-struct aligned_deleter { void operator()(void* p) noexcept { _aligned_free(p); } };
+struct aligned_deleter { void operator()(void* p) noexcept { std::free(p); } };
 
 using ScopedAlignedArrayFloat = std::unique_ptr<float[], aligned_deleter>;
 
 using ScopedAlignedArrayXMVECTOR = std::unique_ptr<XMVECTOR[], aligned_deleter>;
 
-//---------------------------------------------------------------------------------
-struct handle_closer { void operator()(HANDLE h) noexcept { assert(h != INVALID_HANDLE_VALUE); if (h) CloseHandle(h); } };
-
-using ScopedHandle = std::unique_ptr<void, handle_closer>;
-
-inline HANDLE safe_handle(HANDLE h) noexcept { return (h == INVALID_HANDLE_VALUE) ? nullptr : h; }
-
-//---------------------------------------------------------------------------------
-struct find_closer { void operator()(HANDLE h) noexcept { assert(h != INVALID_HANDLE_VALUE); if (h) FindClose(h); } };
-
-using ScopedFindHandle = std::unique_ptr<void, find_closer>;
-
-//---------------------------------------------------------------------------------
-class auto_delete_file
+static inline void* AllocateVectorAligned(size_t length)
 {
-public:
-    auto_delete_file(HANDLE hFile) noexcept : m_handle(hFile) {}
-
-    auto_delete_file(const auto_delete_file&) = delete;
-    auto_delete_file& operator=(const auto_delete_file&) = delete;
-
-    ~auto_delete_file()
-    {
-        if (m_handle)
-        {
-            FILE_DISPOSITION_INFO info = {};
-            info.DeleteFile = TRUE;
-            (void)SetFileInformationByHandle(m_handle, FileDispositionInfo, &info, sizeof(info));
-        }
-    }
-
-    void clear() noexcept { m_handle = nullptr; }
-
-private:
-    HANDLE m_handle;
-};
+    // std::aligned_alloc expects length to be a multiple of the alignment;
+    // all current call sites pass multiples of sizeof(XMVECTOR) == 16.
+    return std::aligned_alloc(16, length);
+}
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..fd91289
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,917 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.18
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+ /usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + $(CMAKE_COMMAND) -E cmake_progress_start /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex/CMakeFiles /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex//CMakeFiles/progress.marks + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all + $(CMAKE_COMMAND) -E cmake_progress_start /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +#============================================================================= +# Target rules for targets named texdiag + +# Build rule for target. +texdiag: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 texdiag +.PHONY : texdiag + +# fast build rule for target. +texdiag/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texdiag.dir/build.make CMakeFiles/texdiag.dir/build +.PHONY : texdiag/fast + +#============================================================================= +# Target rules for targets named texconv + +# Build rule for target. +texconv: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 texconv +.PHONY : texconv + +# fast build rule for target. +texconv/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/build +.PHONY : texconv/fast + +#============================================================================= +# Target rules for targets named texassemble + +# Build rule for target. +texassemble: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 texassemble +.PHONY : texassemble + +# fast build rule for target. +texassemble/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/build +.PHONY : texassemble/fast + +#============================================================================= +# Target rules for targets named DirectXTex + +# Build rule for target. +DirectXTex: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 DirectXTex +.PHONY : DirectXTex + +# fast build rule for target. 
+DirectXTex/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/build +.PHONY : DirectXTex/fast + +DirectXTex/BC.o: DirectXTex/BC.cpp.o + +.PHONY : DirectXTex/BC.o + +# target to build an object file +DirectXTex/BC.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC.cpp.o +.PHONY : DirectXTex/BC.cpp.o + +DirectXTex/BC.i: DirectXTex/BC.cpp.i + +.PHONY : DirectXTex/BC.i + +# target to preprocess a source file +DirectXTex/BC.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC.cpp.i +.PHONY : DirectXTex/BC.cpp.i + +DirectXTex/BC.s: DirectXTex/BC.cpp.s + +.PHONY : DirectXTex/BC.s + +# target to generate assembly for a file +DirectXTex/BC.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC.cpp.s +.PHONY : DirectXTex/BC.cpp.s + +DirectXTex/BC4BC5.o: DirectXTex/BC4BC5.cpp.o + +.PHONY : DirectXTex/BC4BC5.o + +# target to build an object file +DirectXTex/BC4BC5.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC4BC5.cpp.o +.PHONY : DirectXTex/BC4BC5.cpp.o + +DirectXTex/BC4BC5.i: DirectXTex/BC4BC5.cpp.i + +.PHONY : DirectXTex/BC4BC5.i + +# target to preprocess a source file +DirectXTex/BC4BC5.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC4BC5.cpp.i +.PHONY : DirectXTex/BC4BC5.cpp.i + +DirectXTex/BC4BC5.s: DirectXTex/BC4BC5.cpp.s + +.PHONY : DirectXTex/BC4BC5.s + +# target to generate assembly for a file +DirectXTex/BC4BC5.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC4BC5.cpp.s +.PHONY : DirectXTex/BC4BC5.cpp.s + +DirectXTex/BC6HBC7.o: DirectXTex/BC6HBC7.cpp.o + +.PHONY : DirectXTex/BC6HBC7.o + +# target to build an object file +DirectXTex/BC6HBC7.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC6HBC7.cpp.o +.PHONY : DirectXTex/BC6HBC7.cpp.o + +DirectXTex/BC6HBC7.i: DirectXTex/BC6HBC7.cpp.i + +.PHONY : DirectXTex/BC6HBC7.i + +# target to preprocess a source file +DirectXTex/BC6HBC7.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC6HBC7.cpp.i +.PHONY : DirectXTex/BC6HBC7.cpp.i + +DirectXTex/BC6HBC7.s: DirectXTex/BC6HBC7.cpp.s + +.PHONY : DirectXTex/BC6HBC7.s + +# target to generate assembly for a file +DirectXTex/BC6HBC7.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/BC6HBC7.cpp.s +.PHONY : DirectXTex/BC6HBC7.cpp.s + +DirectXTex/DirectXTexConvert.o: DirectXTex/DirectXTexConvert.cpp.o + +.PHONY : DirectXTex/DirectXTexConvert.o + +# target to build an object file +DirectXTex/DirectXTexConvert.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexConvert.cpp.o +.PHONY : DirectXTex/DirectXTexConvert.cpp.o + +DirectXTex/DirectXTexConvert.i: DirectXTex/DirectXTexConvert.cpp.i + +.PHONY : DirectXTex/DirectXTexConvert.i + +# target to preprocess a source file +DirectXTex/DirectXTexConvert.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexConvert.cpp.i +.PHONY : DirectXTex/DirectXTexConvert.cpp.i + +DirectXTex/DirectXTexConvert.s: DirectXTex/DirectXTexConvert.cpp.s + +.PHONY : 
DirectXTex/DirectXTexConvert.s + +# target to generate assembly for a file +DirectXTex/DirectXTexConvert.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexConvert.cpp.s +.PHONY : DirectXTex/DirectXTexConvert.cpp.s + +DirectXTex/DirectXTexD3D11.o: DirectXTex/DirectXTexD3D11.cpp.o + +.PHONY : DirectXTex/DirectXTexD3D11.o + +# target to build an object file +DirectXTex/DirectXTexD3D11.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexD3D11.cpp.o +.PHONY : DirectXTex/DirectXTexD3D11.cpp.o + +DirectXTex/DirectXTexD3D11.i: DirectXTex/DirectXTexD3D11.cpp.i + +.PHONY : DirectXTex/DirectXTexD3D11.i + +# target to preprocess a source file +DirectXTex/DirectXTexD3D11.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexD3D11.cpp.i +.PHONY : DirectXTex/DirectXTexD3D11.cpp.i + +DirectXTex/DirectXTexD3D11.s: DirectXTex/DirectXTexD3D11.cpp.s + +.PHONY : DirectXTex/DirectXTexD3D11.s + +# target to generate assembly for a file +DirectXTex/DirectXTexD3D11.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexD3D11.cpp.s +.PHONY : DirectXTex/DirectXTexD3D11.cpp.s + +DirectXTex/DirectXTexD3D12.o: DirectXTex/DirectXTexD3D12.cpp.o + +.PHONY : DirectXTex/DirectXTexD3D12.o + +# target to build an object file +DirectXTex/DirectXTexD3D12.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexD3D12.cpp.o +.PHONY : DirectXTex/DirectXTexD3D12.cpp.o + +DirectXTex/DirectXTexD3D12.i: DirectXTex/DirectXTexD3D12.cpp.i + +.PHONY : DirectXTex/DirectXTexD3D12.i + +# target to preprocess a source file +DirectXTex/DirectXTexD3D12.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexD3D12.cpp.i +.PHONY : DirectXTex/DirectXTexD3D12.cpp.i + +DirectXTex/DirectXTexD3D12.s: DirectXTex/DirectXTexD3D12.cpp.s + +.PHONY : DirectXTex/DirectXTexD3D12.s + +# target to generate assembly for a file +DirectXTex/DirectXTexD3D12.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexD3D12.cpp.s +.PHONY : DirectXTex/DirectXTexD3D12.cpp.s + +DirectXTex/DirectXTexDDS.o: DirectXTex/DirectXTexDDS.cpp.o + +.PHONY : DirectXTex/DirectXTexDDS.o + +# target to build an object file +DirectXTex/DirectXTexDDS.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexDDS.cpp.o +.PHONY : DirectXTex/DirectXTexDDS.cpp.o + +DirectXTex/DirectXTexDDS.i: DirectXTex/DirectXTexDDS.cpp.i + +.PHONY : DirectXTex/DirectXTexDDS.i + +# target to preprocess a source file +DirectXTex/DirectXTexDDS.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexDDS.cpp.i +.PHONY : DirectXTex/DirectXTexDDS.cpp.i + +DirectXTex/DirectXTexDDS.s: DirectXTex/DirectXTexDDS.cpp.s + +.PHONY : DirectXTex/DirectXTexDDS.s + +# target to generate assembly for a file +DirectXTex/DirectXTexDDS.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexDDS.cpp.s +.PHONY : DirectXTex/DirectXTexDDS.cpp.s + +DirectXTex/DirectXTexFlipRotate.o: DirectXTex/DirectXTexFlipRotate.cpp.o + +.PHONY : DirectXTex/DirectXTexFlipRotate.o + +# target to build an object file 
+DirectXTex/DirectXTexFlipRotate.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexFlipRotate.cpp.o +.PHONY : DirectXTex/DirectXTexFlipRotate.cpp.o + +DirectXTex/DirectXTexFlipRotate.i: DirectXTex/DirectXTexFlipRotate.cpp.i + +.PHONY : DirectXTex/DirectXTexFlipRotate.i + +# target to preprocess a source file +DirectXTex/DirectXTexFlipRotate.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexFlipRotate.cpp.i +.PHONY : DirectXTex/DirectXTexFlipRotate.cpp.i + +DirectXTex/DirectXTexFlipRotate.s: DirectXTex/DirectXTexFlipRotate.cpp.s + +.PHONY : DirectXTex/DirectXTexFlipRotate.s + +# target to generate assembly for a file +DirectXTex/DirectXTexFlipRotate.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexFlipRotate.cpp.s +.PHONY : DirectXTex/DirectXTexFlipRotate.cpp.s + +DirectXTex/DirectXTexHDR.o: DirectXTex/DirectXTexHDR.cpp.o + +.PHONY : DirectXTex/DirectXTexHDR.o + +# target to build an object file +DirectXTex/DirectXTexHDR.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexHDR.cpp.o +.PHONY : DirectXTex/DirectXTexHDR.cpp.o + +DirectXTex/DirectXTexHDR.i: DirectXTex/DirectXTexHDR.cpp.i + +.PHONY : DirectXTex/DirectXTexHDR.i + +# target to preprocess a source file +DirectXTex/DirectXTexHDR.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexHDR.cpp.i +.PHONY : DirectXTex/DirectXTexHDR.cpp.i + +DirectXTex/DirectXTexHDR.s: DirectXTex/DirectXTexHDR.cpp.s + +.PHONY : DirectXTex/DirectXTexHDR.s + +# target to generate assembly for a file +DirectXTex/DirectXTexHDR.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexHDR.cpp.s +.PHONY : DirectXTex/DirectXTexHDR.cpp.s + +DirectXTex/DirectXTexImage.o: DirectXTex/DirectXTexImage.cpp.o + +.PHONY : DirectXTex/DirectXTexImage.o + +# target to build an object file +DirectXTex/DirectXTexImage.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexImage.cpp.o +.PHONY : DirectXTex/DirectXTexImage.cpp.o + +DirectXTex/DirectXTexImage.i: DirectXTex/DirectXTexImage.cpp.i + +.PHONY : DirectXTex/DirectXTexImage.i + +# target to preprocess a source file +DirectXTex/DirectXTexImage.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexImage.cpp.i +.PHONY : DirectXTex/DirectXTexImage.cpp.i + +DirectXTex/DirectXTexImage.s: DirectXTex/DirectXTexImage.cpp.s + +.PHONY : DirectXTex/DirectXTexImage.s + +# target to generate assembly for a file +DirectXTex/DirectXTexImage.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexImage.cpp.s +.PHONY : DirectXTex/DirectXTexImage.cpp.s + +DirectXTex/DirectXTexMipmaps.o: DirectXTex/DirectXTexMipmaps.cpp.o + +.PHONY : DirectXTex/DirectXTexMipmaps.o + +# target to build an object file +DirectXTex/DirectXTexMipmaps.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexMipmaps.cpp.o +.PHONY : DirectXTex/DirectXTexMipmaps.cpp.o + +DirectXTex/DirectXTexMipmaps.i: DirectXTex/DirectXTexMipmaps.cpp.i + +.PHONY : DirectXTex/DirectXTexMipmaps.i + +# target to preprocess a source file 
+DirectXTex/DirectXTexMipmaps.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexMipmaps.cpp.i +.PHONY : DirectXTex/DirectXTexMipmaps.cpp.i + +DirectXTex/DirectXTexMipmaps.s: DirectXTex/DirectXTexMipmaps.cpp.s + +.PHONY : DirectXTex/DirectXTexMipmaps.s + +# target to generate assembly for a file +DirectXTex/DirectXTexMipmaps.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexMipmaps.cpp.s +.PHONY : DirectXTex/DirectXTexMipmaps.cpp.s + +DirectXTex/DirectXTexMisc.o: DirectXTex/DirectXTexMisc.cpp.o + +.PHONY : DirectXTex/DirectXTexMisc.o + +# target to build an object file +DirectXTex/DirectXTexMisc.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexMisc.cpp.o +.PHONY : DirectXTex/DirectXTexMisc.cpp.o + +DirectXTex/DirectXTexMisc.i: DirectXTex/DirectXTexMisc.cpp.i + +.PHONY : DirectXTex/DirectXTexMisc.i + +# target to preprocess a source file +DirectXTex/DirectXTexMisc.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexMisc.cpp.i +.PHONY : DirectXTex/DirectXTexMisc.cpp.i + +DirectXTex/DirectXTexMisc.s: DirectXTex/DirectXTexMisc.cpp.s + +.PHONY : DirectXTex/DirectXTexMisc.s + +# target to generate assembly for a file +DirectXTex/DirectXTexMisc.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexMisc.cpp.s +.PHONY : DirectXTex/DirectXTexMisc.cpp.s + +DirectXTex/DirectXTexNormalMaps.o: DirectXTex/DirectXTexNormalMaps.cpp.o + +.PHONY : DirectXTex/DirectXTexNormalMaps.o + +# target to build an object file +DirectXTex/DirectXTexNormalMaps.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexNormalMaps.cpp.o +.PHONY : DirectXTex/DirectXTexNormalMaps.cpp.o + +DirectXTex/DirectXTexNormalMaps.i: DirectXTex/DirectXTexNormalMaps.cpp.i + +.PHONY : DirectXTex/DirectXTexNormalMaps.i + +# target to preprocess a source file +DirectXTex/DirectXTexNormalMaps.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexNormalMaps.cpp.i +.PHONY : DirectXTex/DirectXTexNormalMaps.cpp.i + +DirectXTex/DirectXTexNormalMaps.s: DirectXTex/DirectXTexNormalMaps.cpp.s + +.PHONY : DirectXTex/DirectXTexNormalMaps.s + +# target to generate assembly for a file +DirectXTex/DirectXTexNormalMaps.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexNormalMaps.cpp.s +.PHONY : DirectXTex/DirectXTexNormalMaps.cpp.s + +DirectXTex/DirectXTexPMAlpha.o: DirectXTex/DirectXTexPMAlpha.cpp.o + +.PHONY : DirectXTex/DirectXTexPMAlpha.o + +# target to build an object file +DirectXTex/DirectXTexPMAlpha.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexPMAlpha.cpp.o +.PHONY : DirectXTex/DirectXTexPMAlpha.cpp.o + +DirectXTex/DirectXTexPMAlpha.i: DirectXTex/DirectXTexPMAlpha.cpp.i + +.PHONY : DirectXTex/DirectXTexPMAlpha.i + +# target to preprocess a source file +DirectXTex/DirectXTexPMAlpha.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexPMAlpha.cpp.i +.PHONY : DirectXTex/DirectXTexPMAlpha.cpp.i + +DirectXTex/DirectXTexPMAlpha.s: DirectXTex/DirectXTexPMAlpha.cpp.s + +.PHONY : 
DirectXTex/DirectXTexPMAlpha.s + +# target to generate assembly for a file +DirectXTex/DirectXTexPMAlpha.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexPMAlpha.cpp.s +.PHONY : DirectXTex/DirectXTexPMAlpha.cpp.s + +DirectXTex/DirectXTexResize.o: DirectXTex/DirectXTexResize.cpp.o + +.PHONY : DirectXTex/DirectXTexResize.o + +# target to build an object file +DirectXTex/DirectXTexResize.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexResize.cpp.o +.PHONY : DirectXTex/DirectXTexResize.cpp.o + +DirectXTex/DirectXTexResize.i: DirectXTex/DirectXTexResize.cpp.i + +.PHONY : DirectXTex/DirectXTexResize.i + +# target to preprocess a source file +DirectXTex/DirectXTexResize.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexResize.cpp.i +.PHONY : DirectXTex/DirectXTexResize.cpp.i + +DirectXTex/DirectXTexResize.s: DirectXTex/DirectXTexResize.cpp.s + +.PHONY : DirectXTex/DirectXTexResize.s + +# target to generate assembly for a file +DirectXTex/DirectXTexResize.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexResize.cpp.s +.PHONY : DirectXTex/DirectXTexResize.cpp.s + +DirectXTex/DirectXTexTGA.o: DirectXTex/DirectXTexTGA.cpp.o + +.PHONY : DirectXTex/DirectXTexTGA.o + +# target to build an object file +DirectXTex/DirectXTexTGA.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexTGA.cpp.o +.PHONY : DirectXTex/DirectXTexTGA.cpp.o + +DirectXTex/DirectXTexTGA.i: DirectXTex/DirectXTexTGA.cpp.i + +.PHONY : DirectXTex/DirectXTexTGA.i + +# target to preprocess a source file +DirectXTex/DirectXTexTGA.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexTGA.cpp.i +.PHONY : DirectXTex/DirectXTexTGA.cpp.i + +DirectXTex/DirectXTexTGA.s: DirectXTex/DirectXTexTGA.cpp.s + +.PHONY : DirectXTex/DirectXTexTGA.s + +# target to generate assembly for a file +DirectXTex/DirectXTexTGA.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexTGA.cpp.s +.PHONY : DirectXTex/DirectXTexTGA.cpp.s + +DirectXTex/DirectXTexUtil.o: DirectXTex/DirectXTexUtil.cpp.o + +.PHONY : DirectXTex/DirectXTexUtil.o + +# target to build an object file +DirectXTex/DirectXTexUtil.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexUtil.cpp.o +.PHONY : DirectXTex/DirectXTexUtil.cpp.o + +DirectXTex/DirectXTexUtil.i: DirectXTex/DirectXTexUtil.cpp.i + +.PHONY : DirectXTex/DirectXTexUtil.i + +# target to preprocess a source file +DirectXTex/DirectXTexUtil.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexUtil.cpp.i +.PHONY : DirectXTex/DirectXTexUtil.cpp.i + +DirectXTex/DirectXTexUtil.s: DirectXTex/DirectXTexUtil.cpp.s + +.PHONY : DirectXTex/DirectXTexUtil.s + +# target to generate assembly for a file +DirectXTex/DirectXTexUtil.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/DirectXTex/DirectXTexUtil.cpp.s +.PHONY : DirectXTex/DirectXTexUtil.cpp.s + +Texassemble/AnimatedGif.o: Texassemble/AnimatedGif.cpp.o + +.PHONY : Texassemble/AnimatedGif.o + +# target to build an object file +Texassemble/AnimatedGif.cpp.o: + $(MAKE) 
$(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/Texassemble/AnimatedGif.cpp.o +.PHONY : Texassemble/AnimatedGif.cpp.o + +Texassemble/AnimatedGif.i: Texassemble/AnimatedGif.cpp.i + +.PHONY : Texassemble/AnimatedGif.i + +# target to preprocess a source file +Texassemble/AnimatedGif.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/Texassemble/AnimatedGif.cpp.i +.PHONY : Texassemble/AnimatedGif.cpp.i + +Texassemble/AnimatedGif.s: Texassemble/AnimatedGif.cpp.s + +.PHONY : Texassemble/AnimatedGif.s + +# target to generate assembly for a file +Texassemble/AnimatedGif.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/Texassemble/AnimatedGif.cpp.s +.PHONY : Texassemble/AnimatedGif.cpp.s + +Texassemble/texassemble.o: Texassemble/texassemble.cpp.o + +.PHONY : Texassemble/texassemble.o + +# target to build an object file +Texassemble/texassemble.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/Texassemble/texassemble.cpp.o +.PHONY : Texassemble/texassemble.cpp.o + +Texassemble/texassemble.i: Texassemble/texassemble.cpp.i + +.PHONY : Texassemble/texassemble.i + +# target to preprocess a source file +Texassemble/texassemble.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/Texassemble/texassemble.cpp.i +.PHONY : Texassemble/texassemble.cpp.i + +Texassemble/texassemble.s: Texassemble/texassemble.cpp.s + +.PHONY : Texassemble/texassemble.s + +# target to generate assembly for a file +Texassemble/texassemble.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texassemble.dir/build.make CMakeFiles/texassemble.dir/Texassemble/texassemble.cpp.s +.PHONY : Texassemble/texassemble.cpp.s + +Texconv/ExtendedBMP.o: Texconv/ExtendedBMP.cpp.o + +.PHONY : Texconv/ExtendedBMP.o + +# target to build an object file +Texconv/ExtendedBMP.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/ExtendedBMP.cpp.o +.PHONY : Texconv/ExtendedBMP.cpp.o + +Texconv/ExtendedBMP.i: Texconv/ExtendedBMP.cpp.i + +.PHONY : Texconv/ExtendedBMP.i + +# target to preprocess a source file +Texconv/ExtendedBMP.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/ExtendedBMP.cpp.i +.PHONY : Texconv/ExtendedBMP.cpp.i + +Texconv/ExtendedBMP.s: Texconv/ExtendedBMP.cpp.s + +.PHONY : Texconv/ExtendedBMP.s + +# target to generate assembly for a file +Texconv/ExtendedBMP.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/ExtendedBMP.cpp.s +.PHONY : Texconv/ExtendedBMP.cpp.s + +Texconv/PortablePixMap.o: Texconv/PortablePixMap.cpp.o + +.PHONY : Texconv/PortablePixMap.o + +# target to build an object file +Texconv/PortablePixMap.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/PortablePixMap.cpp.o +.PHONY : Texconv/PortablePixMap.cpp.o + +Texconv/PortablePixMap.i: Texconv/PortablePixMap.cpp.i + +.PHONY : Texconv/PortablePixMap.i + +# target to preprocess a source file +Texconv/PortablePixMap.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/PortablePixMap.cpp.i +.PHONY : Texconv/PortablePixMap.cpp.i + +Texconv/PortablePixMap.s: Texconv/PortablePixMap.cpp.s + +.PHONY : Texconv/PortablePixMap.s + +# target to generate assembly for a file +Texconv/PortablePixMap.cpp.s: + $(MAKE) $(MAKESILENT) -f 
CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/PortablePixMap.cpp.s +.PHONY : Texconv/PortablePixMap.cpp.s + +Texconv/texconv.o: Texconv/texconv.cpp.o + +.PHONY : Texconv/texconv.o + +# target to build an object file +Texconv/texconv.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/texconv.cpp.o +.PHONY : Texconv/texconv.cpp.o + +Texconv/texconv.i: Texconv/texconv.cpp.i + +.PHONY : Texconv/texconv.i + +# target to preprocess a source file +Texconv/texconv.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/texconv.cpp.i +.PHONY : Texconv/texconv.cpp.i + +Texconv/texconv.s: Texconv/texconv.cpp.s + +.PHONY : Texconv/texconv.s + +# target to generate assembly for a file +Texconv/texconv.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texconv.dir/build.make CMakeFiles/texconv.dir/Texconv/texconv.cpp.s +.PHONY : Texconv/texconv.cpp.s + +Texdiag/texdiag.o: Texdiag/texdiag.cpp.o + +.PHONY : Texdiag/texdiag.o + +# target to build an object file +Texdiag/texdiag.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texdiag.dir/build.make CMakeFiles/texdiag.dir/Texdiag/texdiag.cpp.o +.PHONY : Texdiag/texdiag.cpp.o + +Texdiag/texdiag.i: Texdiag/texdiag.cpp.i + +.PHONY : Texdiag/texdiag.i + +# target to preprocess a source file +Texdiag/texdiag.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texdiag.dir/build.make CMakeFiles/texdiag.dir/Texdiag/texdiag.cpp.i +.PHONY : Texdiag/texdiag.cpp.i + +Texdiag/texdiag.s: Texdiag/texdiag.cpp.s + +.PHONY : Texdiag/texdiag.s + +# target to generate assembly for a file +Texdiag/texdiag.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/texdiag.dir/build.make CMakeFiles/texdiag.dir/Texdiag/texdiag.cpp.s +.PHONY : Texdiag/texdiag.cpp.s + +# target to build an object file +cmake_pch.hxx.pch: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/cmake_pch.hxx.pch +.PHONY : cmake_pch.hxx.pch + +# target to preprocess a source file +cmake_pch.hxx.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/cmake_pch.hxx.i +.PHONY : cmake_pch.hxx.i + +# target to generate assembly for a file +cmake_pch.hxx.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/DirectXTex.dir/build.make CMakeFiles/DirectXTex.dir/cmake_pch.hxx.s +.PHONY : cmake_pch.hxx.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... rebuild_cache" + @echo "... DirectXTex" + @echo "... texassemble" + @echo "... texconv" + @echo "... texdiag" + @echo "... DirectXTex/BC.o" + @echo "... DirectXTex/BC.i" + @echo "... DirectXTex/BC.s" + @echo "... DirectXTex/BC4BC5.o" + @echo "... DirectXTex/BC4BC5.i" + @echo "... DirectXTex/BC4BC5.s" + @echo "... DirectXTex/BC6HBC7.o" + @echo "... DirectXTex/BC6HBC7.i" + @echo "... DirectXTex/BC6HBC7.s" + @echo "... DirectXTex/DirectXTexConvert.o" + @echo "... DirectXTex/DirectXTexConvert.i" + @echo "... DirectXTex/DirectXTexConvert.s" + @echo "... DirectXTex/DirectXTexD3D11.o" + @echo "... DirectXTex/DirectXTexD3D11.i" + @echo "... DirectXTex/DirectXTexD3D11.s" + @echo "... DirectXTex/DirectXTexD3D12.o" + @echo "... DirectXTex/DirectXTexD3D12.i" + @echo "... DirectXTex/DirectXTexD3D12.s" + @echo "... DirectXTex/DirectXTexDDS.o" + @echo "... DirectXTex/DirectXTexDDS.i" + @echo "... DirectXTex/DirectXTexDDS.s" + @echo "... 
DirectXTex/DirectXTexFlipRotate.o" + @echo "... DirectXTex/DirectXTexFlipRotate.i" + @echo "... DirectXTex/DirectXTexFlipRotate.s" + @echo "... DirectXTex/DirectXTexHDR.o" + @echo "... DirectXTex/DirectXTexHDR.i" + @echo "... DirectXTex/DirectXTexHDR.s" + @echo "... DirectXTex/DirectXTexImage.o" + @echo "... DirectXTex/DirectXTexImage.i" + @echo "... DirectXTex/DirectXTexImage.s" + @echo "... DirectXTex/DirectXTexMipmaps.o" + @echo "... DirectXTex/DirectXTexMipmaps.i" + @echo "... DirectXTex/DirectXTexMipmaps.s" + @echo "... DirectXTex/DirectXTexMisc.o" + @echo "... DirectXTex/DirectXTexMisc.i" + @echo "... DirectXTex/DirectXTexMisc.s" + @echo "... DirectXTex/DirectXTexNormalMaps.o" + @echo "... DirectXTex/DirectXTexNormalMaps.i" + @echo "... DirectXTex/DirectXTexNormalMaps.s" + @echo "... DirectXTex/DirectXTexPMAlpha.o" + @echo "... DirectXTex/DirectXTexPMAlpha.i" + @echo "... DirectXTex/DirectXTexPMAlpha.s" + @echo "... DirectXTex/DirectXTexResize.o" + @echo "... DirectXTex/DirectXTexResize.i" + @echo "... DirectXTex/DirectXTexResize.s" + @echo "... DirectXTex/DirectXTexTGA.o" + @echo "... DirectXTex/DirectXTexTGA.i" + @echo "... DirectXTex/DirectXTexTGA.s" + @echo "... DirectXTex/DirectXTexUtil.o" + @echo "... DirectXTex/DirectXTexUtil.i" + @echo "... DirectXTex/DirectXTexUtil.s" + @echo "... Texassemble/AnimatedGif.o" + @echo "... Texassemble/AnimatedGif.i" + @echo "... Texassemble/AnimatedGif.s" + @echo "... Texassemble/texassemble.o" + @echo "... Texassemble/texassemble.i" + @echo "... Texassemble/texassemble.s" + @echo "... Texconv/ExtendedBMP.o" + @echo "... Texconv/ExtendedBMP.i" + @echo "... Texconv/ExtendedBMP.s" + @echo "... Texconv/PortablePixMap.o" + @echo "... Texconv/PortablePixMap.i" + @echo "... Texconv/PortablePixMap.s" + @echo "... Texconv/texconv.o" + @echo "... Texconv/texconv.i" + @echo "... Texconv/texconv.s" + @echo "... Texdiag/texdiag.o" + @echo "... Texdiag/texdiag.i" + @echo "... Texdiag/texdiag.s" + @echo "... cmake_pch.hxx.pch" + @echo "... cmake_pch.hxx.i" + @echo "... cmake_pch.hxx.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/PlatformSupport/ClangBsearch.hpp b/PlatformSupport/ClangBsearch.hpp new file mode 100644 index 0000000..85ed81d --- /dev/null +++ b/PlatformSupport/ClangBsearch.hpp @@ -0,0 +1,124 @@ +/*------------------------------------------------------------------ + * bsearch_s.c + * + * September 2017, Reini Urban + * + * Copyright (c) 2017 by Reini Urban + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *------------------------------------------------------------------
+ */
+
+#pragma once
+#include <errno.h>   /* errno is cleared by bsearch_s below */
+#include <stddef.h>  /* size_t */
+
+/**
+ * @def bsearch_s(key,base,nmemb,size,compar,context)
+ * @brief
+ *    The \c bsearch_s function finds an element equal to the element pointed
+ *    to by key in an array pointed to by base. The array contains nmemb
+ *    elements of size bytes and must be partitioned with respect to key, that
+ *    is, all the elements that compare less than the key object must appear
+ *    before all the elements that compare equal to it, and those must appear
+ *    before all the elements that compare greater than the key object. A
+ *    fully sorted array satisfies these requirements. The elements are
+ *    compared using the function pointed to by compar. The behavior is
+ *    undefined if the array is not already partitioned with respect to *key
+ *    in ascending order according to the same criterion that compar uses.
+ *
+ * @remark SPECIFIED IN
+ *    * C11 standard (ISO/IEC 9899:2011):
+ *      K.3.6.3.1 The bsearch_s function (p: 608-609)
+ *      http://en.cppreference.com/w/c/algorithm/bsearch
+ *
+ * @param[in] key     pointer to the element to search for
+ * @param[in] base    pointer to the array to examine
+ * @param[in] nmemb   number of elements in the array
+ * @param[in] size    size of each element in the array in bytes
+ * @param[in] compar  comparison function which returns a negative integer
+ *                    value if the first argument is less than the second, a
+ *                    positive integer value if the first argument is greater
+ *                    than the second, and zero if the arguments are equal.
+ *                    key is passed as the first argument, an element from
+ *                    the array as the second. The signature of the comparison
+ *                    function should be equivalent to:
+ *                    int cmp(const void *a, const void *b); The function
+ *                    must not modify the objects passed to it and must
+ *                    return consistent results when called for the same
+ *                    objects, regardless of their positions in the array.
+ * @param[in] context additional information (e.g., collating sequence),
+ *                    passed to compar as the third argument
+ *
+ * @pre key, base and compar shall not be null pointers (unless nmemb is zero).
+ * @pre Neither nmemb nor size shall be greater than RSIZE_MAX_MEM.
+ *
+ * If the array contains several elements that compar would indicate as
+ * equal to the element searched for, then it is unspecified which element
+ * the function will return as the result.
+ *
+ * @return Pointer to an element in the array that compares equal to *key, or
+ *         a null pointer if no such element was found or a run-time
+ *         constraint was violated.
+ *
+ * @note
+ *    Despite the name, neither the C nor the POSIX standard requires this
+ *    function to be implemented using binary search or makes any complexity
+ *    guarantees. Unlike other bounds-checked functions, bsearch_s does not
+ *    treat arrays of zero size as a runtime constraint violation and instead
+ *    indicates that the element was not found (the other function that
+ *    accepts arrays of zero size is qsort_s).
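+ *
+ * A minimal usage sketch (illustrative only; cmp_int is a hypothetical
+ * comparator, not part of this header). With the C11-style argument order
+ * used below, key is passed first, the array element second, and the
+ * context pointer last:
+ *
+ *    static int cmp_int(const void *k, const void *e, void *context)
+ *    {
+ *        (void)context;  /* no collating context needed for plain ints */
+ *        int a = *(const int *)k, b = *(const int *)e;
+ *        return (a > b) - (a < b);
+ *    }
+ *
+ *    int key = 3;
+ *    int arr[] = { 1, 2, 3, 4 };
+ *    int *hit = (int *)bsearch_s(&key, arr, 4, sizeof(int), cmp_int, NULL);
+ *    /* hit points at arr[2]; NULL would mean not found or a constraint error */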
+ *
+ * @retval NULL when not found or any error
+ *
+ * errno values:
+ *    ESNULLP  when key, base or compar is a NULL pointer and nmemb > 0
+ *    ESLEMAX  when nmemb or size > RSIZE_MAX_MEM
+ *
+ * @see
+ *    qsort_s()
+ */
+
+static void *
+bsearch_s(const void *key, const void *base, size_t nmemb, size_t size,
+          int (*compar)(const void *k, const void *y, void *context),
+          void *context)
+{
+    errno = 0;
+
+    while (nmemb > 0) {
+        /* probe the middle element of the remaining range */
+        void *ptry = (char *)base + size * (nmemb / 2);
+        /* C11 argument order: key first, array element second, context last */
+        int sign = compar(key, ptry, context);
+        if (!sign)
+            return ptry;
+        else if (nmemb == 1)
+            break;
+        else if (sign < 0) {
+            /* key sorts before ptry: keep searching the lower half */
+            nmemb /= 2;
+        } else {
+            /* key sorts after ptry: keep searching the upper half */
+            base = ptry;
+            nmemb -= nmemb / 2;
+        }
+    }
+    return NULL;
+}
diff --git a/PlatformSupport/ClangCommon.hpp b/PlatformSupport/ClangCommon.hpp
new file mode 100644
index 0000000..7b37eab
--- /dev/null
+++ b/PlatformSupport/ClangCommon.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cstring>   // std::memcpy
+
+#include "ClangBsearch.hpp"
+
+// Minimal memcpy_s shim for non-MSVC builds. Unlike the Annex K memcpy_s,
+// it does not zero the destination on a constraint violation; it simply
+// refuses to copy when the destination is too small.
+static int memcpy_s(void * dest, size_t destsz,
+                    const void * src, size_t count)
+{
+    if (count > destsz)
+    {
+        return 1;
+    }
+    // After the guard above, count <= destsz, so copying count bytes is safe.
+    std::memcpy(dest, src, count);
+    return 0;
+}
diff --git a/PlatformSupport/MSVCSal.h b/PlatformSupport/MSVCSal.h
new file mode 100644
index 0000000..e651378
--- /dev/null
+++ b/PlatformSupport/MSVCSal.h
@@ -0,0 +1,2957 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+/***
+*sal.h - markers for documenting the semantics of APIs
+*
+
+*
+*Purpose:
+*       sal.h provides a set of annotations to describe how a function uses its
+*       parameters - the assumptions it makes about them, and the guarantees it makes
+*       upon finishing.
+****/
+#pragma once
+
+/*==========================================================================
+
+   The comments in this file are intended to give basic understanding of
+   the usage of SAL, the Microsoft Source Code Annotation Language.
+   For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134
+
+   The macros are defined in 3 layers, plus the structural set:
+
+   _In_/_Out_/_Ret_ Layer:
+   ----------------------
+   This layer provides the highest abstraction and its macros should be used
+   in most cases. These macros typically start with:
+      _In_     : input parameter to a function, unmodified by called function
+      _Out_    : output parameter, written to by called function, pointed-to
+                 location not expected to be initialized prior to call
+      _Outptr_ : like _Out_ when returned variable is a pointer type
+                 (so param is pointer-to-pointer type). Called function
+                 provides/allocates space.
+      _Outref_ : like _Outptr_, except param is reference-to-pointer type.
+      _Inout_  : inout parameter, read from and potentially modified by
+                 called function.
+      _Ret_    : for return values
+      _Field_  : class/struct field invariants
+   For common usage, this class of SAL provides the most concise annotations.
+   Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used
+   with a parameter target. Using them with _At_ to specify non-parameter
+   targets may yield unexpected results.
+
+   This layer also includes a number of other properties that can be specified
+   to extend the ability of code analysis, most notably:
+      -- Designating parameters as format strings for printf/scanf/scanf_s
+      -- Requesting stricter type checking for C enum parameters
+
+   _Pre_/_Post_ Layer:
+   ------------------
+   The macros of this layer only should be used when there is no suitable macro
+   in the _In_/_Out_ layer.
+   Its macros start with _Pre_ or _Post_.
+   This layer provides the most flexibility for annotations.
+
+   Implementation Abstraction Layer:
+   --------------------------------
+   Macros from this layer should never be used directly. The layer only exists
+   to hide the implementation of the annotation macros.
+
+   Structural Layer:
+   ----------------
+   These annotations, like _At_ and _When_, are used with annotations from
+   any of the other layers as modifiers, indicating exactly when and where
+   the annotations apply.
+
+
+   Common syntactic conventions:
+   ----------------------------
+
+   Usage:
+   -----
+   _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters.
+   _Ret_, _Deref_ret_ must be used for return values.
+
+   Nullness:
+   --------
+   If the parameter can be NULL as a precondition to the function, the
+   annotation contains _opt. If the macro does not contain '_opt' the
+   parameter cannot be NULL.
+
+   If an out/inout parameter returns a null pointer as a postcondition, this is
+   indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not
+   of this form, then the result will not be NULL as a postcondition.
+      _Outptr_                  - output value is not NULL
+      _Outptr_result_maybenull_ - output value might be NULL
+
+   String Type:
+   -----------
+   _z: NullTerminated string
+   for _In_ parameters the buffer must have the specified stringtype before the call
+   for _Out_ parameters the buffer must have the specified stringtype after the call
+   for _Inout_ parameters both conditions apply
+
+   Extent Syntax:
+   -------------
+   Buffer sizes are expressed as element counts, unless the macro explicitly
+   contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in
+   which case the second is used to indicate how much of the buffer is valid
+   as a postcondition. This table outlines the precondition buffer allocation
+   size, precondition number of valid elements, postcondition allocation size,
+   and postcondition number of valid elements for representative buffer size
+   annotations:
+                                        Pre    |  Pre    |  Post   |  Post
+                                        alloc  |  valid  |  alloc  |  valid
+      Annotation                        elems  |  elems  |  elems  |  elems
+      ----------                        ------------------------------------
+      _In_reads_(s)                     s      |  s      |  s      |  s
+      _Inout_updates_(s)                s      |  s      |  s      |  s
+      _Inout_updates_to_(s,c)           s      |  s      |  s      |  c
+      _Out_writes_(s)                   s      |  0      |  s      |  s
+      _Out_writes_to_(s,c)              s      |  0      |  s      |  c
+      _Outptr_result_buffer_(s)         ?      |  ?      |  s      |  s
+      _Outptr_result_buffer_to_(s,c)    ?      |  ?      |  s      |  c
+
+   For the _Outptr_ annotations, the buffer in question is at one level of
+   dereference. The called function is responsible for supplying the buffer.
+
+   Success and failure:
+   -------------------
+   The SAL concept of success allows functions to define expressions that can
+   be tested by the caller, which if it evaluates to non-zero, indicates the
+   function succeeded, which means that its postconditions are guaranteed to
+   hold. Otherwise, if the expression evaluates to zero, the function is
+   considered to have failed, and the postconditions are not guaranteed.
+
+   The success criteria can be specified with the _Success_(expr) annotation:
+      _Success_(return != FALSE) BOOL
+      PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) :
+         pszBuf is only guaranteed to be NULL-terminated when TRUE is returned,
+         and FALSE indicates failure. In common practice, callers check for zero
+         vs. non-zero returns, so it is preferable to express the success
+         criteria in terms of zero/non-zero, not checked for exactly TRUE.
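+
+   As a further illustration (GetWidgetCount is a hypothetical function, not
+   part of this header), the zero/non-zero convention reads:
+      _Success_(return != 0) int
+      GetWidgetCount(_Out_ unsigned* pCount) :
+         *pCount is guaranteed to be initialized only when a non-zero value
+         is returned; after a zero return the caller must not rely on it.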
+ + Functions can specify that some postconditions will still hold, even when + the function fails, using _On_failure_(anno-list), or postconditions that + hold regardless of success or failure using _Always_(anno-list). + + The annotation _Return_type_success_(expr) may be used with a typedef to + give a default _Success_ criteria to all functions returning that type. + This is the case for common Windows API status types, including + HRESULT and NTSTATUS. This may be overridden on a per-function basis by + specifying a _Success_ annotation locally. + +============================================================================*/ + +#define __ATTR_SAL + +#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ +#define _SAL_VERSION 20 +#endif + +#ifdef _PREFAST_ // [ + +// choose attribute or __declspec implementation +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] +#endif // ] + + +#if !_USE_DECLSPECS_FOR_SAL // [ +#if !_USE_ATTRIBUTES_FOR_SAL // [ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] +#endif // ] +#endif // ] + +#else + +// Disable expansion of SAL macros in non-Prefast mode to +// improve compiler throughput. +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#ifndef _USE_ATTRIBUTES_FOR_SAL // [ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#endif // ] + +// safeguard for MIDL and RC builds +#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL + +// Special enum type for Y/N/M +enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; + +#endif + +#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) +#else +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) +#endif + 
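+// Illustrative sketch (hypothetical names, not part of this header): a status
+// typedef can carry a default success criterion, which every function
+// returning that type then inherits:
+//
+//    _Return_type_success_(return >= 0) typedef long MYSTATUS;
+//    MYSTATUS OpenThing(_Outptr_ void** ppObj);
+//       // postconditions on *ppObj are guaranteed only when the return is >= 0
+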
+//============================================================================
+// Structural SAL:
+// These annotations modify the use of other annotations. They may
+// express the annotation target (i.e. what parameter/field the annotation
+// applies to) or the condition under which the annotation is applicable.
+//============================================================================
+
+// _At_(target, annos) specifies that the annotations listed in 'annos' is to
+// be applied to 'target' rather than to the identifier which is the current
+// lexical target.
+#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_)
+
+// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that
+// target names a buffer, and each annotation in annos is applied to each
+// element of target up to bound, with the variable named in iter usable
+// by the annotations to refer to relevant offsets within target.
+#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_)
+
+// _When_(expr, annos) specifies that the annotations listed in 'annos' only
+// apply when 'expr' evaluates to non-zero.
+#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_)
+#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_)
+#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_)
+
+// indicates whether normal post conditions apply to a function
+#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr))
+
+// indicates whether post conditions apply to a function returning
+// the type that this annotation is applied to
+#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr))
+
+// Establish postconditions that apply only if the function does not succeed
+#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_)
+
+// Establish postconditions that apply in both success and failure cases.
+// Only applicable with functions that have _Success_ or _Return_type_success_.
+#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_)
+
+// Usable on a function definition. Asserts that a function declaration is
+// in scope, and its annotations are to be used. There are no other annotations
+// allowed on the function definition.
+#define _Use_decl_annotations_ _Use_decl_anno_impl_
+
+// _Notref_ may precede a _Deref_ or "real" annotation, and removes one
+// level of dereference if the parameter is a C++ reference (&). If the
+// net deref on a "real" annotation is negative, it is simply discarded.
+#define _Notref_ _Notref_impl_
+
+// Annotations for defensive programming styles.
+#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive)
+#define _Post_defensive_ _SA_annotes0(SAL_post_defensive)
+
+#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes)
+#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes)
+#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes)
+
+//============================================================================
+// _In_\_Out_ Layer:
+//============================================================================
+
+// Reserved pointer parameters, must always be NULL.
+#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl))
+
+// _Const_ allows specification that any namable memory location is considered
+// readonly for a given call.
+#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref))
+
+
+// Input parameters --------------------------
+
+// _In_ - Annotations for parameters where data is passed into the function, but not modified.
+// _In_ by itself can be used with non-pointer types (although it is redundant).
+
+// e.g. void SetPoint( _In_ const POINT* pPT );
+#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref))
+#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_)
+
+// nullterminated 'in' parameters.
+// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo );
+#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl))
+#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl))
+
+
+// 'input' buffers with given size
+
+#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_)
+#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_)
+#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_)
+#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
+#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_)
+#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_)
+#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size))))
+#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size))))
+
+
+// 'input' buffers valid to the given end pointer
+
+#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_)
+#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_)
+#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_)
+#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_)
+
+
+
+// Output parameters --------------------------
+
+// _Out_ - Annotations for pointer or reference parameters where data is passed back to the caller.
+// These are mostly used where the pointer/reference is to a non-pointer type.
+// _Outptr_/_Outref_ (see below) are typically used to return pointers via parameters.
+
+// e.g. void GetPoint( _Out_ POINT* pPT );
+#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_)
+#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_)
+
+#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_)
+#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_)
+#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_)
+#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_)
+#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_)
+
+#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count))
+#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count))
+#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size)))
+#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size)))
+
+#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count))
+#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count))
+#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size)))
+#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size)))
+
+#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_)
+#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_)
+#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ _Post_z_)
+#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ _Post_z_)
+
+
+// Inout parameters ----------------------------
+
+// _Inout_ - Annotations for pointer or reference parameters where data is passed in and
+// potentially modified.
+// void ModifyPoint( _Inout_ POINT* pPT ); +// void ModifyPointByRef( _Inout_ POINT& pPT ); + +#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) +#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) + +// For modifying string buffers +// void toupper( _Inout_z_ char* sz ); +#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) +#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) + +// For modifying buffers with explicit element size +#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) +#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) + +#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) +#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) + +#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) + +// For modifying buffers with explicit byte size +#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_bytes_opt_(size) _SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) + +#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) +#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) + +#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) + + +// Pointer to pointer parameters ------------------------- + +// _Outptr_ - Annotations for output params returning pointers +// These describe parameters where the called function provides the buffer: +// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); +// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates +// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. +// +// _Outptr_opt_ - describes parameters that are allowed to be NULL. +// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. 
+// +// Example: +// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); +// Callers: +// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL +// MyFunc(&pData1, &pData2); // ok: both non-NULL +// if (*pData1 == *pData2) ... // error: pData2 might be NULL after call + +#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) +#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) + +// Annotations for _Outptr_ parameters returning pointers to null terminated strings. + +#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) +#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) +#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) +#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) + +// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. + +#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, +// following the COM convention of setting the output to NULL on failure. +// The current implementation is identical to _Outptr_result_nullonfailure_. +// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
+ +#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes + +#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) + +#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) + +#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_(size, count) 
_SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) + +#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) + +// Annotations for output reference to pointer parameters. 
+ +#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) +#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) + +#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) + +#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) + +// Annotations for output reference to pointer parameters that guarantee +// that the pointer is set to NULL on failure. +#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) + +// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. +#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) +#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) + + +// return values ------------------------------- + +// +// _Ret_ annotations +// +// describing conditions that hold for return values after the call + +// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); +#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) +#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// used with allocated but not yet initialized objects +#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) +#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) +#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) + +// used with allocated and initialized objects +// returns single valid object +#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) + +// returns pointer to initialized buffer of specified size +#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) + +// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' +#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) + + +// Annotations for strict type checking +#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) +#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) +#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) + +// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); +#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) +#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) + +// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
+
+
+// Annotations for strict type checking
+#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_)
+#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_)
+#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_)
+
+// Check the return value of a function e.g. _Check_return_ ErrorCode Foo();
+#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_)
+#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_)
+
+// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... );
+#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_)
+#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_)
+#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_)
+
+#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where)
+#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x))
+#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x))
+#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x))
+
+// annotations to express value of integral or pointer parameter
+#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub))
+#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub))
+#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub))
+#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub))
+#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub))
+#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub))
+#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr))
+#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr))
+
+// annotation to express that a value (usually a field of a mutable class)
+// is not changed by a function call
+#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_))
+
+// Annotations to allow expressing generalized pre and post conditions.
+// 'cond' may be any valid SAL expression that is considered to be true as a precondition
+// or postcondition (respectively).
+#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond))
+#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond))
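+
+// additional illustrative example (hypothetical function, not from the original header):
+// e.g. void SetBlockSize( _Pre_satisfies_(cbBlock % 16 == 0) size_t cbBlock );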
+
+// Annotations to express struct, class and field invariants
+#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size))
+
+#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size))
+#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size))
+#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count))
+#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count))
+#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size))
+#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size))
+
+#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size))
+#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size))
+#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count))
+#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count))
+#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size))
+#define _Field_size_bytes_full_opt_(size) _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size))
+
+#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_)
+
+#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max))
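+
+// illustrative use of the field annotations (hypothetical struct, not from the original header):
+// e.g. struct BLOB { size_t cbData; _Field_size_bytes_(cbData) unsigned char* pbData; };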
+
+//============================================================================
+//   _Pre_\_Post_ Layer:
+//============================================================================
+
+//
+// Raw Pre/Post for declaring custom pre/post conditions
+//
+
+#define _Pre_ _Pre_impl_
+#define _Post_ _Post_impl_
+
+//
+// Validity property
+//
+
+#define _Valid_ _Valid_impl_
+#define _Notvalid_ _Notvalid_impl_
+#define _Maybevalid_ _Maybevalid_impl_
+
+//
+// Buffer size properties
+//
+
+// Expressing buffer sizes without specifying pre or post condition
+#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size))
+#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size))
+#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size))
+#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size))
+
+#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_)
+#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_)
+
+// Expressing buffer size as pre or post condition
+#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
+#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), _Pre1_impl_(__cap_impl(size)))
+#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
+#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size)))
+
+#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_)
+#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size)))
+#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
+#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size)))
+
+//
+// Pointer null-ness properties
+//
+#define _Null_ _Null_impl_
+#define _Notnull_ _Notnull_impl_
+#define _Maybenull_ _Maybenull_impl_
+
+//
+// _Pre_ annotations ---
+//
+// describing conditions that must be met before the call of the function
+
+// e.g. int strlen( _Pre_z_ const char* sz );
+// buffer is a zero terminated string
+#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_)
+
+// valid size unknown or indicated by type (e.g.: LPSTR)
+#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_)
+#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_)
+
+#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl))
+
+// Overrides recursive valid when some field is not yet initialized when using _Inout_
+#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl))
+
+// used with allocated but not yet initialized objects
+#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref))
+#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref))
+#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref))
+
+//
+// _Post_ annotations ---
+//
+// describing conditions that hold after the function call
+
+// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo );
+// buffer will be a zero-terminated string after the call
+#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_)
+
+// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj );
+#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_)
+#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl))
+
+// e.g. void free( _Post_ptr_invalid_ void* pv );
+#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl))
+
+// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv );
+#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl))
+
+// e.g. HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p);
+#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl))
+
+#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl))
+
+#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_)
+
+
+// #pragma region Input Buffer SAL 1 compatibility macros
+
+/*==========================================================================
+
+    This section contains definitions for macros defined for VS2010 and earlier.
+    Usage of these macros is still supported, but the SAL 2 macros defined above
+    are recommended instead. This comment block is retained to assist in
+    understanding SAL that still uses the older syntax.
+
+    The macros are defined in 3 layers:
+
+    _In_\_Out_ Layer:
+    ----------------
+    This layer provides the highest abstraction and its macros should be used
+    in most cases. Its macros start with _In_, _Out_ or _Inout_. For the
+    typical case they provide the most concise annotations.
+
+    _Pre_\_Post_ Layer:
+    ------------------
+    The macros of this layer should only be used when there is no suitable macro
+    in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_,
+    _Deref_pre_, _Deref_post_ and _Deref_ret_. This layer provides the most
+    flexibility for annotations.
+
+    Implementation Abstraction Layer:
+    --------------------------------
+    Macros from this layer should never be used directly. The layer only exists
+    to hide the implementation of the annotation macros.
+
+
+    Annotation Syntax:
+    |--------------|----------|----------------|-----------------------------|
+    |   Usage      | Nullness | ZeroTerminated |  Extent                     |
+    |--------------|----------|----------------|-----------------------------|
+    | _In_         | <>       | <>             | <>                          |
+    | _Out_        | opt_     | z_             | [byte]cap_[c_|x_]( size )   |
+    | _Inout_      |          |                | [byte]count_[c_|x_]( size ) |
+    | _Deref_out_  |          |                | ptrdiff_cap_( ptr )         |
+    |--------------|          |                | ptrdiff_count_( ptr )       |
+    | _Ret_        |          |                |                             |
+    | _Deref_ret_  |          |                |                             |
+    |--------------|          |                |                             |
+    | _Pre_        |          |                |                             |
+    | _Post_       |          |                |                             |
+    | _Deref_pre_  |          |                |                             |
+    | _Deref_post_ |          |                |                             |
+    |--------------|----------|----------------|-----------------------------|
+
+    Usage:
+    -----
+    _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for
+    formal parameters.
+    _Ret_, _Deref_ret_ must be used for return values.
+
+    Nullness:
+    --------
+    If the pointer can be NULL the annotation contains _opt. If the macro
+    does not contain '_opt' the pointer may not be NULL.
+
+    String Type:
+    -----------
+    _z: NullTerminated string
+    for _In_ parameters the buffer must have the specified stringtype before the call
+    for _Out_ parameters the buffer must have the specified stringtype after the call
+    for _Inout_ parameters both conditions apply
+
+    Extent Syntax:
+    |------|---------------|---------------|
+    | Unit | Writ\Readable | Argument Type |
+    |------|---------------|---------------|
+    |  <>  | cap_          | <>            |
+    | byte | count_        | c_            |
+    |      |               | x_            |
+    |------|---------------|---------------|
+
+    'cap' (capacity) describes the writable size of the buffer and is typically used
+    with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes.
+    'count' describes the readable size of the buffer and is typically used with _In_.
+    The default unit is elements. Use 'bytecount' if the size is given in bytes.
+
+    Argument syntax for cap_, bytecap_, count_, bytecount_:
+    (<parameter>|return)[+n]  e.g. cch, return, cb+2
+
+    If the buffer size is a constant expression use the c_ postfix.
+    E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16)
+
+    If the buffer size is given by a limiting pointer use the ptrdiff_ versions
+    of the macros.
+
+    If the buffer size is neither a parameter nor a constant expression use the x_
+    postfix. e.g. bytecount_x_(num*size). x_ annotations accept any arbitrary string.
+    No analysis can be done for x_ annotations but they at least tell the tool that
+    the buffer has some sort of extent description. x_ annotations might be supported
+    by future compiler versions.
+
+============================================================================*/
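+
+// Approximate SAL 2 equivalents for the most common SAL 1 annotations below
+// (rough guide only; see the SAL 2 definitions earlier in this file):
+//   _In_count_(n)    -> _In_reads_(n)        _In_bytecount_(n) -> _In_reads_bytes_(n)
+//   _Out_cap_(n)     -> _Out_writes_(n)      _Out_z_cap_(n)    -> _Out_writes_z_(n)
+//   _Inout_count_(n) -> _Inout_updates_(n)   _Deref_out_       -> _Outptr_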
+
+// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch )
+// valid buffer extent described by another parameter
+#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_)
+#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_)
+#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_)
+#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
+
+// valid buffer extent described by a constant expression
+#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_)
+#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_)
+#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_)
+#define _In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_)
+
+// nullterminated 'input' buffers with given size
+
+// e.g. void SetCharRange( _In_z_count_(cch) const char* rgch, size_t cch )
+// nullterminated valid buffer extent described by another parameter
+#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_)
+#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_)
+#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_)
+#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
+
+// nullterminated valid buffer extent described by a constant expression
+#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_)
+#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_)
+#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_)
+#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_)
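+
+// additional illustrative example (hypothetical function, not from the original header):
+// e.g. void LogMessage( _In_z_count_(cchMsg) const char* pszMsg, size_t cchMsg );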
+
+// buffer capacity is described by another pointer
+// e.g. void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while( pch < pchMax ) pch++; }
+#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_)
+#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_)
+
+// 'x' version for complex expressions that are not supported by the current compiler version
+// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows );
+#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_)
+#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_)
+#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_)
+#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_)
+
+
+// 'out' with buffer size
+// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices );
+// buffer capacity is described by another parameter
+#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_)
+#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_)
+#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_)
+#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_)
+
+// buffer capacity is described by a constant expression
+#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_)
+#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_)
+#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_)
+#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_)
+
+// buffer capacity is described by another parameter multiplied by a constant expression
+#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_)
+#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_)
+#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_)
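+
+// additional illustrative example (hypothetical function, not from the original header);
+// the writable capacity here is 2*cPairs elements:
+// e.g. void GetRanges( _Out_cap_m_(2,cPairs) int* rgBounds, size_t cPairs );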
+
+// buffer capacity is described by another pointer
+// e.g. void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while( pch < pchMax ) pch++; }
+#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_)
+#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_)
+
+// buffer capacity is described by a complex expression
+#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_)
+#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_)
+#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_)
+#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_)
+
+// a zero terminated string is filled into a buffer of given capacity
+// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo );
+// buffer capacity is described by another parameter
+#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_)
+
+// buffer capacity is described by a constant expression
+#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_)
+
+// buffer capacity is described by a complex expression
+#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_)
+#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_)
+
+// a buffer of given capacity is filled; the valid extent after the call is described by another parameter
+// e.g. size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return) char* rgchTo, size_t cchTo );
+#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count))
+#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count))
+#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count))
+#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count))
+
+// a zero terminated string is filled into a buffer of given capacity
+// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo );
+#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count))
+#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count))
+#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count))
+#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count))
+
+// only use with dereferenced arguments e.g. '*pcch'
+#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount))
+#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount))
+#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount))
+#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount))
+
+#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount))
+#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount))
+#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount))
+#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount))
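+
+// additional illustrative example of the '*pcch' pattern (hypothetical function,
+// not from the original header): capacity on input and count on output are both *pcb
+// e.g. void ReadData( _Out_bytecapcount_(*pcb) unsigned char* pb, size_t* pcb );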
+
+// e.g. GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen );
+#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount))
+#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount))
+#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount))
+#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount))
+
+
+// 'inout' buffers with initialized elements before and after the call
+// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices );
+#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size))
+#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size))
+#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size))
+#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size))
+
+#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size))
+#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size))
+#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size))
+#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size))
+
+// nullterminated 'inout' buffers with initialized elements before and after the call
+// e.g. void ModifyString( _Inout_z_count_(cch) char* sz, size_t cch );
+#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size))
+#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size))
+#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size))
+#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size))
+
+#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size))
+#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size))
+#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size))
+#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size))
+
+#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size))
+#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size))
+
+#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size))
+#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size))
+#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size))
+#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size))
+
+// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR szTo, size_t cchTo );
+#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_)
+#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_)
+#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_)
+#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_)
+
+#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_)
+#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_)
+#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_)
+#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_)
+
+#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_)
+#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_)
+#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_)
+#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_)
+
+// inout string buffers with writable size
+// e.g. void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo );
+#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_)
+#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_)
+#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_)
+#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_)
+
+#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_)
+#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_)
+#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_)
+#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_)
+
+#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_)
+#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_)
+#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_)
+#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_)
+
+
+// returning pointers to valid objects
+#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_)
+#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_)
+
+// annotations to express 'boundedness' of integral value parameter
+#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_)
+#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_)
+#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_)
+#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_)
+#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_)
+#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_)
+#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_)
+
+// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT );
+#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_)
+#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_)
+#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_)
+#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_)
+
+// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo );
+#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_)
+#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_)
+#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_)
+#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_)
+
+//
+// _Deref_pre_ ---
+//
+// describing conditions for array elements of dereferenced pointer parameters that must be met before the call
+
+// e.g. void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] );
+#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_)
+
+// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[] );
+// buffer capacity is described by another parameter
+#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)))
+#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)))
+#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)))
+#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)))
+
+// buffer capacity is described by a constant expression
+#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)))
+#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)))
+#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)))
+#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)))
+
+// buffer capacity is described by a complex condition
+#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)))
+#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)))
+#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)))
+#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)))
+
+// convenience macros for nullterminated buffers with given capacity
+#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_)
+
+#define _Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_)
+
+#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_)
+
+// known capacity and valid but unknown readable extent
+#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_)
+
+#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_)
+
+#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_)
+
+// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n );
+// valid buffer extent is described by another parameter
+#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
+
+// valid buffer extent is described by a constant expression
+#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_)
+
+// valid buffer extent is described by a complex expression
+#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_count_x_(size) _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_)
+#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_)
+
+// e.g. void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems );
+#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_)
+#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_)
+#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl))
+
+#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref))
+#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref))
+#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref))
+
+// restrict access rights
+#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref))
+#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref))
+
+//
+// _Deref_post_ ---
+//
+// describing conditions for array elements or dereferenced pointer parameters that hold after the call
+
+// e.g. void CloneString( _In_z_ const WCHAR* wzIn, _Out_ _Deref_post_z_ WCHAR** pWzOut );
+#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_)
+#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_)
+
+// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv );
+// buffer capacity is described by another parameter
+#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)))
+#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)))
+#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)))
+#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)))
+
+// buffer capacity is described by a constant expression
+#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)))
+#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)))
+#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)))
+#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)))
+
+// buffer capacity is described by a complex expression
+#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)))
+#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)))
+#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)))
+#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)))
+
+// convenience macros for nullterminated buffers with given capacity
+#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_)
+#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_)
+
+#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_)
+
+#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_)
+
+// known capacity and valid but unknown readable extent
+#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_)
+#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_)
+
+#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_)
+
+#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_)
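+
+// note: the _Deref_post_valid_cap_ forms above suit buffers whose contents are
+// initialized but whose readable extent is not tracked; the plain _Deref_post_cap_
+// forms earlier describe capacity only (e.g. freshly allocated, uninitialized memory).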
+
+// e.g. HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv );
+// valid buffer extent is described by another parameter
+#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_)
+#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
+
+// valid buffer extent is described by a constant expression
+#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
+
+// valid buffer extent is described by a complex expression
+#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
+#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
+
+// e.g. void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems );
+#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_)
+#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_)
+
+#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref))
+#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref))
+#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref))
+
+//
+// _Deref_ret_ ---
+//
+
+#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl))
+#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl))
+
+//
+// special _Deref_ ---
+//
+#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref))
+
+//
+// _Ret_ ---
+//
+
+// e.g. _Ret_opt_valid_ LPSTR CloneSTR( _Pre_valid_ LPSTR src );
+#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_)
+#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_)
+
+// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb );
+// Buffer capacity is described by another parameter
+#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size)))
+#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size)))
+#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size)))
+#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size)))
+
+// Buffer capacity is described by a constant expression
+#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size)))
+#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size)))
+#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size)))
+#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size)))
+
+// Buffer capacity is described by a complex condition
+#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size)))
+#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size)))
+#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size)))
+#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size)))
_SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); +// Valid Buffer extent is described by another parameter +#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a constant expression +#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a complex expression +#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) + +// return value is nullterminated and length is given by another parameter +#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecount_(size) 
_SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) + + +// _Pre_ annotations --- +#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// restrict access rights +#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) +#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) + +// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); +// buffer capacity described by another parameter +#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) +#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) + +// buffer capacity described by a constant expression +#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) +#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) +#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) + +// buffer capacity described by size of other buffer, only used by dangerous legacy APIs +// e.g. 
int strcpy(_Pre_cap_for_(src) char* dst, const char* src); +#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) +#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) + +// buffer capacity described by a complex condition +#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) +#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) + +// buffer capacity described by the difference to another pointer parameter +#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) +#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) + +// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); +#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) 
_Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// Valid buffer extent described by another parameter +#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a constant expression +#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a complex expression +#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by the difference to another pointer parameter +#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) +#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) + + +// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) +// buffer maybe zero-terminated after the call +#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl)) + +// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); +#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size))) + +// e.g. 
size_t strlen( _In_z_ _Post_count_(return+1) const char* sz );
+#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_)
+#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
+#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
+#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
+#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
+#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
+
+// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cchTo) _Post_z_count_(return+1) char* szTo, size_t cchTo );
+#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_)
+#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_)
+#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_)
+#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_)
+#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_)
+#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_)
+
+//
+// _Prepost_ ---
+//
+// describing conditions that hold before and after the function call
+
+#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_)
+
+#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size))
+#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size))
+#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size))
+#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size))
+#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size))
+#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size))
+#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size))
+#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size))
+#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size))
+#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size))
+#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size))
+#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size)
_Post_bytecount_x_(size)) + +#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) +#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) + +// +// _Deref_ --- +// +// short version for _Deref_pre_ _Deref_post_ +// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call + +#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) +#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) + +#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) +#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) +#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) +#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) + +#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) +#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) +#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) +#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) + +#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) +#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) +#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) +#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) + +#define _Deref_prepost_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) +#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) +#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) +#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) + +#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) +#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) +#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) 
_Deref_post_valid_bytecap_x_(size))
+#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size))
+
+#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size))
+#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size))
+#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size))
+#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size))
+
+#define _Deref_prepost_count_x_(size) _SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size))
+#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size))
+#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size))
+#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size))
+
+#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_)
+#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_)
+
+//
+// _Deref_
+//
+// used with references to arrays
+
+#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_)
+#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_)
+#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_)
+#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_)
+#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_)
+
+// #pragma endregion Input Buffer SAL 1 compatibility macros
+
+
+//============================================================================
+// Implementation Layer:
+//============================================================================
+
+
+// Naming conventions:
+// A symbol that begins with _SA_ is for the machinery of creating any
+// annotations; many of those come from sourceannotations.h in the case
+// of attributes.
+
+// A symbol that ends with _impl is the very lowest level macro. It is
+// not required to be a legal standalone annotation, and in the case
+// of attribute annotations, usually is not. (In the case of some declspec
+// annotations, it might be, but it should not be assumed so.) Those
+// symbols will be used in the _PreN..., _PostN... and _RetN... annotations
+// to build up more complete annotations.
+
+// A symbol ending in _impl_ is reserved to the implementation as well,
+// but it does form a complete annotation; usually they are used to build
+// up even higher level annotations.
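+
+// For orientation, a worked example of this layering (illustrative only; the
+// expansions shown repeat the definitions given earlier in this header): the
+// public annotation _Ret_z_cap_(size) is built from
+//
+//   _Ret1_impl_(__notnull_impl_notref)           -- returned pointer is not NULL
+//   _Ret2_impl_(__zterm_impl,__cap_impl(size))   -- NUL-terminated, writable to size elements
+//   _Ret_valid_impl_                             -- and valid in the post state
+//
+// where each __*_impl token is given a concrete meaning by whichever
+// implementation mode (attributes, declspecs, or nothing) is selected below.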
+
+
+#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [
+// Sharable "_impl" macros: these can be shared between the various annotation
+// forms but are part of the implementation of the macros. These are collected
+// here to ensure that only necessary differences in the annotations
+// exist.
+
+#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_)
+#define _Bound_impl_ _SA_annotes0(SAL_bound)
+#define _Field_range_impl_(min,max) _Range_impl_(min,max)
+#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes)
+#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe)
+#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe)
+#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect)
+#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no)
+#define _Notnull_impl_ _SA_annotes1(SAL_null, __no)
+#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no)
+#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string")))
+#define _Null_impl_ _SA_annotes1(SAL_null, __yes)
+#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes)
+#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_
+#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_
+#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no))
+#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond)
+#define _Post_valid_impl_ _Post1_impl_(__valid_impl)
+#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond)
+#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl)
+#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max)
+#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size))
+#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size))
+#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl)
+#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond)
+#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes)
+#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size))
+#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size))
+
+#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max)
+#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max)
+#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max)
+#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max)
+#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max)
+#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max)
+
+#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_
+#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_
+
+// The following are for the implementation machinery, and are not
+// suitable for annotating general code.
+// We're trying to phase this out, someday. The parser quotes the param.
+#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE)
+
+// Normally the parser does some simple type checking of annotation params,
+// defer that check to the plugin.
+#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) + +#define _SA_SPECSTRIZE( x ) #x +#define _SAL_nop_impl_ /* nothing */ +#define __nop_impl(x) x +#endif + + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +// Using attributes for sal + +#include "codeanalysis\sourceannotations.h" + + +#define _SA_annotes0(n) [SAL_annotes(Name=#n)] +#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] +#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] +#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] + +#define _Pre_impl_ [SAL_pre] +#define _Post_impl_ [SAL_post] +#define _Deref_impl_ [SAL_deref] +#define _Notref_impl_ [SAL_notref] + + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +// Benign declspec needed here for WindowsPREfast +#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +// Using declspecs for sal + +#define _SA_annotes0(n) __declspec(#n) +#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) +#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") +#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") + +#define _Pre_impl_ _SA_annotes0(SAL_pre) +#define _Post_impl_ _SA_annotes0(SAL_post) +#define _Deref_impl_ _SA_annotes0(SAL_deref) +#define _Notref_impl_ _SA_annotes0(SAL_notref) + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun + +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun + +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) + +#else // ][ + +// Using "nothing" for sal + +#define _SA_annotes0(n) +#define _SA_annotes1(n,pp1) +#define _SA_annotes2(n,pp1,pp2) +#define _SA_annotes3(n,pp1,pp2,pp3) + +#define __ANNOTATION(fun) +#define __PRIMOP(type, fun) +#define __QUALIFIER(type, fun) + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ + +// Declare annotations that need to be declared. +__ANNOTATION(SAL_useHeader(void)); +__ANNOTATION(SAL_bound(void)); +__ANNOTATION(SAL_allocator(void)); //??? 
resolve with PFD +__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); +__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); +__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_encoded(void)); +__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_volatile(void)); +__ANNOTATION(SAL_nonvolatile(void)); +__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); +__ANNOTATION(SAL_mustInspect(void)); + +// Only appears in model files, but needs to be declared. +__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); + +// To be declared well-known soon. +__ANNOTATION(SAL_interlocked(void);) + +#pragma warning (suppress: 28227 28241) +__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) + +__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); +__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); +__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] + +#define _Success_impl_(expr) [SA_Success(Condition=#expr)] +#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) + +#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] +#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] +#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] + +#define _In_bound_impl_ [SA_PreBound(Deref=0)] +#define _Out_bound_impl_ [SA_PostBound(Deref=0)] +#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] +#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] +#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] +#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] + +#define __valid_impl Valid=SA_Yes +#define __maybevalid_impl Valid=SA_Maybe +#define __notvalid_impl Valid=SA_No + +#define __null_impl Null=SA_Yes +#define __maybenull_impl Null=SA_Maybe +#define __notnull_impl Null=SA_No + +#define __null_impl_notref Null=SA_Yes,Notref=1 +#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 +#define __notnull_impl_notref Null=SA_No,Notref=1 + +#define __zterm_impl NullTerminated=SA_Yes +#define __maybezterm_impl NullTerminated=SA_Maybe +#define __maybzterm_impl NullTerminated=SA_Maybe +#define __notzterm_impl NullTerminated=SA_No + +#define __readaccess_impl Access=SA_Read +#define __writeaccess_impl Access=SA_Write +#define __allaccess_impl Access=SA_ReadWrite + +#define __readaccess_impl_notref Access=SA_Read,Notref=1 +#define __writeaccess_impl_notref Access=SA_Write,Notref=1 +#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 + +#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ + +// For SAL2, we need to expect general expressions. 
+ +#define __cap_impl(size) WritableElements="\n"#size +#define __bytecap_impl(size) WritableBytes="\n"#size +#define __bytecount_impl(size) ValidBytes="\n"#size +#define __count_impl(size) ValidElements="\n"#size + +#else // ][ + +#define __cap_impl(size) WritableElements=#size +#define __bytecap_impl(size) WritableBytes=#size +#define __bytecount_impl(size) ValidBytes=#size +#define __count_impl(size) ValidElements=#size + +#endif // ] + +#define __cap_c_impl(size) WritableElementsConst=size +#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 +#define __cap_for_impl(param) WritableElementsLength=#param +#define __cap_x_impl(size) WritableElements="\n@"#size + +#define __bytecap_c_impl(size) WritableBytesConst=size +#define __bytecap_x_impl(size) WritableBytes="\n@"#size + +#define __mult_impl(mult,size) __cap_impl((mult)*(size)) + +#define __count_c_impl(size) ValidElementsConst=size +#define __count_x_impl(size) ValidElements="\n@"#size + +#define __bytecount_c_impl(size) ValidBytesConst=size +#define __bytecount_x_impl(size) ValidBytes="\n@"#size + + +#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) +#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) + +#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] +#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] + +#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! + +#define _Pre1_impl_(p1) [SA_Pre(p1)] +#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] +#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] + +#define _Post1_impl_(p1) [SA_Post(p1)] +#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Ret1_impl_(p1) [SA_Post(p1)] +#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] +#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] +#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] + + +#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] +#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] +#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] + +// Obsolete -- may be needed for transition to attributes. 
+#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] +#define __inner_exceptthat [SAL_except] + + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) + +#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) +#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) + +#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") +#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") +#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") + +#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ +#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ +#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ +#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ + + +#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) +#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) +#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) + +#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) +#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) +#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) + +#define __null_impl_notref _Notref_ _Null_impl_ +#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ +#define __notnull_impl_notref _Notref_ _Notnull_impl_ + +#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) +#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) + +#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) + +#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) + +#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) +#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) +#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) + +#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + +#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + 
+#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) +#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) +#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) +#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) + +#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! + +#define _Pre1_impl_(p1) _Pre_impl_ p1 +#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 +#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 + +#define _Post1_impl_(p1) _Post_impl_ p1 +#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Ret1_impl_(p1) _Post_impl_ p1 +#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 +#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 +#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 + +#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 + +#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) +#define __inner_exceptthat _SA_annotes0(SAL_except) + +#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ + +// minimum attribute expansion for foreground build + +#pragma push_macro( "SA" ) +#pragma push_macro( "REPEATABLE" ) + +#ifdef __cplusplus // [ +#define SA( id ) id +#define REPEATABLE [repeatable] +#else // !__cplusplus // ][ +#define SA( id ) SA_##id +#define REPEATABLE +#endif // !__cplusplus // ] + +REPEATABLE +[source_annotation_attribute( SA( Parameter ) )] +struct __P_impl +{ +#ifdef __cplusplus // [ + __P_impl(); +#endif // ] + int __d_; +}; +typedef struct __P_impl __P_impl; + +REPEATABLE +[source_annotation_attribute( SA( ReturnValue ) )] +struct __R_impl +{ +#ifdef __cplusplus // [ + __R_impl(); +#endif // ] + int __d_; +}; +typedef struct __R_impl __R_impl; + +[source_annotation_attribute( SA( Method ) )] +struct __M_ +{ +#ifdef __cplusplus // [ + __M_(); +#endif // ] + int __d_; +}; +typedef struct __M_ __M_; + +[source_annotation_attribute( SA( All ) )] +struct __A_ +{ +#ifdef __cplusplus // [ + __A_(); +#endif // ] + int __d_; +}; +typedef struct __A_ __A_; + +[source_annotation_attribute( SA( Field ) )] +struct __F_ +{ +#ifdef __cplusplus // [ + __F_(); +#endif // ] + int __d_; +}; +typedef struct __F_ __F_; + +#pragma pop_macro( "REPEATABLE" ) +#pragma pop_macro( "SA" ) + + +#define _SAL_nop_impl_ + +#define _At_impl_(target, annos) [__A_(__d_=0)] 
+#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] +#define _When_impl_(expr, annos) annos +#define _Group_impl_(annos) annos +#define _GrouP_impl_(annos) annos +#define _Use_decl_anno_impl_ [__M_(__d_=0)] + +#define _Points_to_data_impl_ [__P_impl(__d_=0)] +#define _Literal_impl_ [__P_impl(__d_=0)] +#define _Notliteral_impl_ [__P_impl(__d_=0)] + +#define _Pre_valid_impl_ [__P_impl(__d_=0)] +#define _Post_valid_impl_ [__P_impl(__d_=0)] +#define _Ret_valid_impl_ [__R_impl(__d_=0)] + +#define _Check_return_impl_ [__R_impl(__d_=0)] +#define _Must_inspect_impl_ [__R_impl(__d_=0)] + +#define _Success_impl_(expr) [__M_(__d_=0)] +#define _On_failure_impl_(expr) [__M_(__d_=0)] +#define _Always_impl_(expr) [__M_(__d_=0)] + +#define _Printf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_s_format_string_impl_ [__P_impl(__d_=0)] + +#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] +#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] + +#define _In_bound_impl_ [__P_impl(__d_=0)] +#define _Out_bound_impl_ [__P_impl(__d_=0)] +#define _Ret_bound_impl_ [__R_impl(__d_=0)] +#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] + +#define _Range_impl_(min,max) [__P_impl(__d_=0)] +#define _In_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] +#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] + +#define _Field_range_impl_(min,max) [__F_(__d_=0)] + +#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Satisfies_impl_(cond) [__A_(__d_=0)] + +#define _Null_impl_ [__A_(__d_=0)] +#define _Notnull_impl_ [__A_(__d_=0)] +#define _Maybenull_impl_ [__A_(__d_=0)] + +#define _Valid_impl_ [__A_(__d_=0)] +#define _Notvalid_impl_ [__A_(__d_=0)] +#define _Maybevalid_impl_ [__A_(__d_=0)] + +#define _Readable_bytes_impl_(size) [__A_(__d_=0)] +#define _Readable_elements_impl_(size) [__A_(__d_=0)] +#define _Writable_bytes_impl_(size) [__A_(__d_=0)] +#define _Writable_elements_impl_(size) [__A_(__d_=0)] + +#define _Null_terminated_impl_ [__A_(__d_=0)] +#define _NullNull_terminated_impl_ [__A_(__d_=0)] + +#define _Pre_impl_ [__P_impl(__d_=0)] +#define _Pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Post_impl_ [__P_impl(__d_=0)] +#define _Post1_impl_(p1) [__P_impl(__d_=0)] +#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] +#define _Deref2_post1_impl_(p1) 
//[__P_impl(__d_=0)] +#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] + +#else // ][ + + +#define _SAL_nop_impl_ X + +#define _At_impl_(target, annos) +#define _When_impl_(expr, annos) +#define _Group_impl_(annos) +#define _GrouP_impl_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) +#define _Use_decl_anno_impl_ +#define _Points_to_data_impl_ +#define _Literal_impl_ +#define _Notliteral_impl_ +#define _Notref_impl_ + +#define _Pre_valid_impl_ +#define _Post_valid_impl_ +#define _Ret_valid_impl_ + +#define _Check_return_impl_ +#define _Must_inspect_impl_ + +#define _Success_impl_(expr) +#define _On_failure_impl_(annos) +#define _Always_impl_(annos) + +#define _Printf_format_string_impl_ +#define _Scanf_format_string_impl_ +#define _Scanf_s_format_string_impl_ + +#define _In_bound_impl_ +#define _Out_bound_impl_ +#define _Ret_bound_impl_ +#define _Deref_in_bound_impl_ +#define _Deref_out_bound_impl_ +#define _Deref_ret_bound_impl_ + +#define _Range_impl_(min,max) +#define _In_range_impl_(min,max) +#define _Out_range_impl_(min,max) +#define _Ret_range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) + +#define _Satisfies_impl_(expr) +#define _Pre_satisfies_impl_(expr) +#define _Post_satisfies_impl_(expr) + +#define _Null_impl_ +#define _Notnull_impl_ +#define _Maybenull_impl_ + +#define _Valid_impl_ +#define _Notvalid_impl_ +#define _Maybevalid_impl_ + +#define _Field_range_impl_(min,max) + +#define _Pre_impl_ +#define _Pre1_impl_(p1) +#define _Pre2_impl_(p1,p2) +#define _Pre3_impl_(p1,p2,p3) + +#define _Post_impl_ +#define _Post1_impl_(p1) +#define _Post2_impl_(p1,p2) +#define _Post3_impl_(p1,p2,p3) + +#define _Ret1_impl_(p1) +#define _Ret2_impl_(p1,p2) +#define _Ret3_impl_(p1,p2,p3) + +#define _Deref_pre1_impl_(p1) +#define _Deref_pre2_impl_(p1,p2) +#define _Deref_pre3_impl_(p1,p2,p3) + +#define _Deref_post1_impl_(p1) +#define _Deref_post2_impl_(p1,p2) +#define _Deref_post3_impl_(p1,p2,p3) + +#define _Deref_ret1_impl_(p1) +#define _Deref_ret2_impl_(p1,p2) +#define _Deref_ret3_impl_(p1,p2,p3) + +#define _Deref2_pre1_impl_(p1) +#define _Deref2_post1_impl_(p1) +#define _Deref2_ret1_impl_(p1) + +#define _Readable_bytes_impl_(size) +#define _Readable_elements_impl_(size) +#define _Writable_bytes_impl_(size) +#define _Writable_elements_impl_(size) + +#define _Null_terminated_impl_ +#define _NullNull_terminated_impl_ + +// Obsolete -- may be needed for transition to attributes. +#define __inner_typefix(ctype) +#define __inner_exceptthat + +#endif // ] + +// This section contains the deprecated annotations + +/* + ------------------------------------------------------------------------------- + Introduction + + sal.h provides a set of annotations to describe how a function uses its + parameters - the assumptions it makes about them, and the guarantees it makes + upon finishing. + + Annotations may be placed before either a function parameter's type or its return + type, and describe the function's behavior regarding the parameter or return value. + There are two classes of annotations: buffer annotations and advanced annotations. + Buffer annotations describe how functions use their pointer parameters, and + advanced annotations either describe complex/unusual buffer behavior, or provide + additional information about a parameter that is not otherwise expressible. 
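+
+   As a quick orientation before the details below (an illustrative sketch;
+   the tables and examples in the following sections are the reference, and
+   the function and parameter names here are made up), the two classes of
+   annotations combine on a declaration like this:
+
+       __success(return != FALSE)                 -- advanced annotation: guarantees
+                                                  --   below hold only when TRUE is returned
+       BOOL CopyData(
+           __in_bcount(cbSrc) const BYTE *pSrc,   -- buffer annotation: read-only, cbSrc bytes
+           __out_bcount(cbDst) BYTE *pDst,        -- buffer annotation: written by the
+                                                  --   function, capacity of cbDst bytes
+           size_t cbSrc,
+           size_t cbDst);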
+ + ------------------------------------------------------------------------------- + Buffer Annotations + + The most important annotations in sal.h provide a consistent way to annotate + buffer parameters or return values for a function. Each of these annotations describes + a single buffer (which could be a string, a fixed-length or variable-length array, + or just a pointer) that the function interacts with: where it is, how large it is, + how much is initialized, and what the function does with it. + + The appropriate macro for a given buffer can be constructed using the table below. + Just pick the appropriate values from each category, and combine them together + with a leading underscore. Some combinations of values do not make sense as buffer + annotations. Only meaningful annotations can be added to your code; for a list of + these, see the buffer annotation definitions section. + + Only a single buffer annotation should be used for each parameter. + + |------------|------------|---------|--------|----------|----------|---------------| + | Level | Usage | Size | Output | NullTerm | Optional | Parameters | + |------------|------------|---------|--------|----------|----------|---------------| + | <> | <> | <> | <> | _z | <> | <> | + | _deref | _in | _ecount | _full | _nz | _opt | (size) | + | _deref_opt | _out | _bcount | _part | | | (size,length) | + | | _inout | | | | | | + | | | | | | | | + |------------|------------|---------|--------|----------|----------|---------------| + + Level: Describes the buffer pointer's level of indirection from the parameter or + return value 'p'. + + <> : p is the buffer pointer. + _deref : *p is the buffer pointer. p must not be NULL. + _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of + the annotation is ignored. + + Usage: Describes how the function uses the buffer. + + <> : The buffer is not accessed. If used on the return value or with _deref, the + function will provide the buffer, and it will be uninitialized at exit. + Otherwise, the caller must provide the buffer. This should only be used + for alloc and free functions. + _in : The function will only read from the buffer. The caller must provide the + buffer and initialize it. Cannot be used with _deref. + _out : The function will only write to the buffer. If used on the return value or + with _deref, the function will provide the buffer and initialize it. + Otherwise, the caller must provide the buffer, and the function will + initialize it. + _inout : The function may freely read from and write to the buffer. The caller must + provide the buffer and initialize it. If used with _deref, the buffer may + be reallocated by the function. + + Size: Describes the total size of the buffer. This may be less than the space actually + allocated for the buffer, in which case it describes the accessible amount. + + <> : No buffer size is given. If the type specifies the buffer size (such as + with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one + element long. Must be used with _in, _out, or _inout. + _ecount : The buffer size is an explicit element count. + _bcount : The buffer size is an explicit byte count. + + Output: Describes how much of the buffer will be initialized by the function. For + _inout buffers, this also describes how much is initialized at entry. Omit this + category for _in buffers; they must be fully initialized by the caller. + + <> : The type specifies how much is initialized. 
For instance, a function initializing
+             an LPWSTR must NULL-terminate the string.
+   _full   : The function initializes the entire buffer.
+   _part   : The function initializes part of the buffer, and explicitly indicates how much.
+
+ NullTerm: States whether the presence of a '\0' marks the end of valid elements in the buffer.
+   _z      : A '\0' indicates the end of the buffer.
+   _nz     : The buffer may not be null terminated and a '\0' does not indicate the end of the
+             buffer.
+
+ Optional: Describes if the buffer itself is optional.
+
+   <>      : The pointer to the buffer must not be NULL.
+   _opt    : The pointer to the buffer might be NULL. It will be checked before being dereferenced.
+
+ Parameters: Gives explicit counts for the size and length of the buffer.
+
+   <>            : There is no explicit count. Use when neither _ecount nor _bcount is used.
+   (size)        : Only the buffer's total size is given. Use with _ecount or _bcount but not _part.
+   (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part
+                   and _bcount_part.
+
+ -------------------------------------------------------------------------------
+ Buffer Annotation Examples
+
+ LWSTDAPI_(BOOL) StrToIntExA(
+     __in LPCSTR pszString,
+     DWORD dwFlags,
+     __out int *piRet                   -- A pointer whose dereference will be filled in.
+ );
+
+ void MyPaintingFunction(
+     __in HWND hwndControl,             -- An initialized read-only parameter.
+     __in_opt HDC hdcOptional,          -- An initialized read-only parameter that might be NULL.
+     __inout IPropertyStore *ppsStore   -- An initialized parameter that may be freely used
+                                        --   and modified.
+ );
+
+ LWSTDAPI_(BOOL) PathCompactPathExA(
+     __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cchMax elements that will
+                                        --   be NULL terminated on exit.
+     __in LPCSTR pszSrc,
+     UINT cchMax,
+     DWORD dwFlags
+ );
+
+ HRESULT SHLocalAllocBytes(
+     size_t cb,
+     __deref_bcount(cb) T **ppv         -- A pointer whose dereference will be set to an
+                                        --   uninitialized buffer with cb bytes.
+ );
+
+ __inout_bcount_full(cb) : A buffer with cb bytes that is fully initialized at
+     entry and exit, and may be written to by this function.
+
+ __out_ecount_part(count, *countOut) : A buffer with count elements that will be
+     partially initialized by this function. The function indicates how much it
+     initialized by setting *countOut.
+
+ -------------------------------------------------------------------------------
+ Advanced Annotations
+
+ Advanced annotations describe behavior that is not expressible with the regular
+ buffer macros. These may be used either to annotate buffer parameters that involve
+ complex or conditional behavior, or to enrich existing annotations with additional
+ information.
+
+ __success(expr) f :
+     indicates whether function f succeeded or not. If expr is true at exit,
+     all the function's guarantees (as given by other annotations) must hold. If
+     expr is false at exit, the caller should not expect any of the function's guarantees
+     to hold. If not used, the function must always satisfy its guarantees. Added
+     automatically to functions that indicate success in standard ways, such as by
+     returning an HRESULT.
+
+ __nullterminated p :
+     Pointer p is a buffer that may be read or written up to and including the first
+     NULL character or pointer. May be used on typedefs, which marks valid (properly
+     initialized) instances of that type as being NULL-terminated.
+
+ __nullnullterminated p :
+     Pointer p is a buffer that may be read or written up to and including the first
+     sequence of two NULL characters or pointers. May be used on typedefs, which marks
+    -------------------------------------------------------------------------------
+    Advanced Annotations
+
+    Advanced annotations describe behavior that is not expressible with the regular
+    buffer macros. These may be used either to annotate buffer parameters that involve
+    complex or conditional behavior, or to enrich existing annotations with additional
+    information.
+
+    __success(expr) f :
+        Indicates whether function f succeeded or not. If <expr> is true at exit,
+        all the function's guarantees (as given by other annotations) must hold. If
+        <expr> is false at exit, the caller should not expect any of the function's
+        guarantees to hold. If not used, the function must always satisfy its
+        guarantees. Added automatically to functions that indicate success in
+        standard ways, such as by returning an HRESULT.
+
+    __nullterminated p :
+        Pointer p is a buffer that may be read or written up to and including the first
+        NULL character or pointer. May be used on typedefs, which marks valid (properly
+        initialized) instances of that type as being NULL-terminated.
+
+    __nullnullterminated p :
+        Pointer p is a buffer that may be read or written up to and including the first
+        sequence of two NULL characters or pointers. May be used on typedefs, which marks
+        valid instances of that type as being double-NULL terminated.
+
+    __reserved v :
+        Value v must be 0/NULL, reserved for future use.
+
+    __checkReturn v :
+        Return value v must not be ignored by callers of this function.
+
+    __typefix(ctype) v :
+        Value v should be treated as an instance of ctype, rather than its declared type.
+
+    __override f :
+        Specify C#-style 'override' behaviour for overriding virtual methods.
+
+    __callback f :
+        Function f can be used as a function pointer.
+
+    __format_string p :
+        Pointer p is a string that contains % markers in the style of printf.
+
+    __blocksOn(resource) f :
+        Function f blocks on the resource 'resource'.
+
+    __fallthrough :
+        Annotates switch statement labels where fall-through is desired, to distinguish
+        from forgotten break statements.
+
+    -------------------------------------------------------------------------------
+    Advanced Annotation Examples
+
+    __success(return != FALSE) LWSTDAPI_(BOOL)
+    PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) :
+        pszBuf is only guaranteed to be NULL-terminated when TRUE is returned.
+
+    typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings.
+
+    __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be
+        a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs.
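+
+    A further, hypothetical example in the same style (the function name is not
+    from the original documentation):
+
+    __checkReturn __success(return == S_OK) HRESULT
+    HypotheticalCopy(__in_z LPCWSTR pszIn, __out_ecount(cchMax) LPWSTR pszOut, UINT cchMax) :
+        the returned HRESULT must not be ignored, and pszOut is only guaranteed
+        to be initialized when S_OK is returned.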
+
+    -------------------------------------------------------------------------------
+*/
+
+#define __specstrings
+
+#ifdef __cplusplus // [
+#ifndef __nothrow // [
+# define __nothrow __declspec(nothrow)
+#endif // ]
+extern "C" {
+#else // ][
+#ifndef __nothrow // [
+# define __nothrow
+#endif // ]
+#endif /* #ifdef __cplusplus */ // ]
+
+
+/*
+    -------------------------------------------------------------------------------
+    Helper Macro Definitions
+
+    These express behavior common to many of the high-level annotations.
+    DO NOT USE THESE IN YOUR CODE.
+    -------------------------------------------------------------------------------
+*/
+
+/*
+    The helper annotations are only understood by the compiler version used by
+    various defect detection tools. When the regular compiler is running, they
+    are defined into nothing, and do not affect the compiled code.
+*/
+
+#if !defined(__midl) && defined(_PREFAST_) // [
+
+    /*
+        In the primitive "SAL_*" annotations, "SAL" stands for Standard
+        Annotation Language. These "SAL_*" annotations are the primitives the
+        compiler understands; the high-level macros decompose into these
+        primitives.
+    */
+
+    #define _SA_SPECSTRIZE( x ) #x
+
+    /*
+        __null p
+        __notnull p
+        __maybenull p
+
+        Annotates a pointer p. States that pointer p is null. Commonly used
+        in the negated form __notnull or the possibly null form __maybenull.
+    */
+
+#ifndef PAL_STDCPP_COMPAT
+    #define __null                  _Null_impl_
+    #define __notnull               _Notnull_impl_
+    #define __maybenull             _Maybenull_impl_
+#endif // !PAL_STDCPP_COMPAT
+
+    /*
+        __readonly l
+        __notreadonly l
+        __maybereadonly l
+
+        Annotates a location l. States that location l is not modified after
+        this point. If the annotation is placed on the precondition state of
+        a function, the restriction only applies until the postcondition state
+        of the function. __maybereadonly states that the annotated location
+        may be modified, whereas __notreadonly states that a location must be
+        modified.
+    */
+
+    #define __readonly              _Pre1_impl_(__readaccess_impl)
+    #define __notreadonly           _Pre1_impl_(__allaccess_impl)
+    #define __maybereadonly         _Pre1_impl_(__readaccess_impl)
+
+    /*
+        __valid v
+        __notvalid v
+        __maybevalid v
+
+        Annotates any value v. States that the value satisfies all properties of
+        valid values of its type. For example, for a string buffer, valid means
+        that the buffer pointer is either NULL or points to a NULL-terminated string.
+    */
+
+    #define __valid                 _Valid_impl_
+    #define __notvalid              _Notvalid_impl_
+    #define __maybevalid            _Maybevalid_impl_
+
+    /*
+        __readableTo(extent) p
+
+        Annotates a buffer pointer p. If the buffer can be read, extent describes
+        how much of the buffer is readable. For a reader of the buffer, this is
+        an explicit permission to read up to that amount, rather than a restriction
+        to read only up to it.
+    */
+
+    #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent)
+
+    /*
+        __elem_readableTo(size)
+
+        Annotates a buffer pointer p as being readable to size elements.
+    */
+
+    #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size ))
+
+    /*
+        __byte_readableTo(size)
+
+        Annotates a buffer pointer p as being readable to size bytes.
+    */
+    #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size))
+
+    /*
+        __writableTo(extent) p
+
+        Annotates a buffer pointer p. If the buffer can be modified, extent
+        describes how much of the buffer is writable (usually the allocation
+        size). For a writer of the buffer, this is an explicit permission to
+        write up to that amount, rather than a restriction to write only up to it.
+    */
+    #define __writableTo(size)   _SA_annotes1(SAL_writableTo, size)
+
+    /*
+        __elem_writableTo(size)
+
+        Annotates a buffer pointer p as being writable to size elements.
+    */
+    #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size ))
+
+    /*
+        __byte_writableTo(size)
+
+        Annotates a buffer pointer p as being writable to size bytes.
+    */
+    #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size ))
+
+    /*
+        __deref p
+
+        Annotates a pointer p. The next annotation applies one dereference down
+        in the type. If readableTo(p, size) then the next annotation applies to
+        all elements *(p+i) for which i satisfies the size. If p is a pointer
+        to a struct, the next annotation applies to all fields of the struct.
+    */
+    #define __deref                 _Deref_impl_
+
+    /*
+        __pre __next_annotation
+
+        The next annotation applies in the precondition state.
+    */
+    #define __pre                   _Pre_impl_
+
+    /*
+        __post __next_annotation
+
+        The next annotation applies in the postcondition state.
+    */
+    #define __post                  _Post_impl_
+
+    /*
+        __precond(<expr>)
+
+        When <expr> is true, the next annotation applies in the precondition state.
+        (Currently not enabled.)
+    */
+    #define __precond(expr)         __pre
+
+    /*
+        __postcond(<expr>)
+
+        When <expr> is true, the next annotation applies in the postcondition state.
+        (Currently not enabled.)
+    */
+    #define __postcond(expr)        __post
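+
+    /*
+        Illustrative note (not part of the original header): the high-level
+        buffer macros are built from these primitives. For example, __ecount(size)
+        is defined below in terms of __notnull __elem_writableTo(size), and the
+        "z" variants append __post __nullterminated to the decomposition.
+    */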
+
+    /*
+        __exceptthat
+
+        Given a set of annotations Q containing __exceptthat maybeP, the effect of
+        the except clause is to erase any P or notP annotations (explicit or
+        implied) within Q at the same level of dereferencing that the except
+        clause appears, and to replace it with maybeP.
+
+        Example 1: __valid __pre_except_maybenull on a pointer p means that the
+                   pointer may be null, and is otherwise valid, thus overriding
+                   the implicit notnull annotation implied by __valid on
+                   pointers.
+
+        Example 2: __valid __deref __pre_except_maybenull on an int **p means
+                   that p is not null (implied by valid), but the elements
+                   pointed to by p could be null, and are otherwise valid.
+    */
+    #define __exceptthat __inner_exceptthat
+
+    /*
+        __refparam
+
+        Added to all out parameter macros to indicate that they are all reference
+        parameters.
+    */
+    #define __refparam _Notref_ __deref __notreadonly
+
+    /*
+        __inner_*
+
+        Helper macros that directly correspond to certain high-level annotations.
+    */
+
+    /*
+        Macros to classify the entrypoints and indicate their category.
+
+        Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM.
+    */
+    #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category)
+
+
+    /*
+        Pre-defined data entry point categories include: Registry, File, Network.
+    */
+    #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category)
+
+    #define __inner_override _SA_annotes0(__override)
+    #define __inner_callback _SA_annotes0(__callback)
+    #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource)
+    #define __inner_fallthrough_dec __inline __nothrow void __FallThrough() {}
+    #define __inner_fallthrough __FallThrough();
+
+    #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_
+    #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_
+
+    #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_
+    #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_
+
+    #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size))
+    #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size))
+
+
+#else // ][
+#ifndef PAL_STDCPP_COMPAT
+    #define __null
+    #define __notnull
+#endif // !PAL_STDCPP_COMPAT
+    #define __maybenull
+    #define __readonly
+    #define __notreadonly
+    #define __maybereadonly
+    #define __valid
+    #define __notvalid
+    #define __maybevalid
+    #define __readableTo(extent)
+    #define __elem_readableTo(size)
+    #define __byte_readableTo(size)
+    #define __writableTo(size)
+    #define __elem_writableTo(size)
+    #define __byte_writableTo(size)
+    #define __deref
+    #define __pre
+    #define __post
+    #define __precond(expr)
+    #define __postcond(expr)
+    #define __exceptthat
+    #define __inner_override
+    #define __inner_callback
+    #define __inner_blocksOn(resource)
+    #define __inner_fallthrough_dec
+    #define __inner_fallthrough
+    #define __refparam
+    #define __inner_control_entrypoint(category)
+    #define __inner_data_entrypoint(category)
+
+    #define __post_except_maybenull
+    #define __pre_except_maybenull
+    #define __post_deref_except_maybenull
+    #define __pre_deref_except_maybenull
+
+    #define __inexpressible_readableTo(size)
+    #define __inexpressible_writableTo(size)
+
+#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ]
+
+/*
+-------------------------------------------------------------------------------
+Buffer Annotation Definitions
+
+Any of these may be used to directly annotate functions, but only one should
+be used for each parameter. To determine which annotation to use for a given
+buffer, use the table in the buffer annotations section.
+-------------------------------------------------------------------------------
+*/
+
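+// Illustrative (hypothetical) example of how the definitions below map onto
+// SAL2: with these macros in effect, a declaration such as
+//
+//     HRESULT ReadThing(__in_ecount(cch) const wchar_t* psz, size_t cch);
+//
+// is analyzed exactly as if it had been written with _In_reads_(cch).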
+// These macros conflict with C++ headers.
+#ifndef PAL_STDCPP_COMPAT
+#define __in _SAL1_Source_(__in, (), _In_)
+#define __out _SAL1_Source_(__out, (), _Out_)
+#endif // !PAL_STDCPP_COMPAT
+
+#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size))
+#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size))
+#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size))
+#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size))
+#define __in_z _SAL1_Source_(__in_z, (), _In_z_)
+#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size))
+#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated)
+#define __in_nz _SAL1_Source_(__in_nz, (), __in)
+#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size))
+#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size))
+#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size))
+#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size))
+#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length))
+#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length))
+#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size))
+#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size))
+#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated)
+#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull)
+#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated)
+#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated)
+#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated)
+#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated)
+#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated)
+#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated)
+#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam)
+#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull)
+#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam)
+#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam)
+#define __inout _SAL1_Source_(__inout, (), _Inout_)
+#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size))
+#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size))
+#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length))
+#define __inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length))
+#define 
__inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), _Inout_updates_all_(size)) +#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) +#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) +#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) +#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) +#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) +#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) +#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) +#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) +#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) +#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) +#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) +#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) +#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) +#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) +#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) +#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) +#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) +#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) +#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) +#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) +#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) +#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) +#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) +#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) +#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) +#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) +#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) +#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) +#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) +#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post 
__nullterminated)
+#define __out_bcount_nz_opt(size) _SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated)
+#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_)
+#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull)
+#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull)
+#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull)
+#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull)
+#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull)
+#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull)
+#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated)
+#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated)
+#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size))
+#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt)
+#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size))
+#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size))
+#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size))
+#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size))
+#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_)
+#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size))
+#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size))
+#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length))
+#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length))
+#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size))
+#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size))
+#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_)
+#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated)
+#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated)
+#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out)
+#define __deref_out_ecount_nz(size) _SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size))
+#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_bcount(size))
+#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam)
+#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated)
+#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size))
+#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount, (size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size))
+#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length))
+#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length))
+#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size))
+#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size))
+#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated)
+#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated)
+#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout)
+#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size))
+#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_bcount(size))
+#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull)
+#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull)
+#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull)
+#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull)
+#define __deref_out_bcount_opt(size) _SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull)
+#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull)
+#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull)
+#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull)
+#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull)
+#define __deref_out_z_opt 
_SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) +#define __deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) +#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) +#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) +#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) +#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) +#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) +#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) +#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) +#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) +#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) +#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) +#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) +#define 
__deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), __deref_out_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) +#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) +#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) +#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) +#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) +#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) +#define 
__deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), __deref_out_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) +#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) +#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) +#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) +#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) +#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) +#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) +#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) + +/* +------------------------------------------------------------------------------- +Advanced Annotation Definitions + +Any of these may be used to directly annotate functions, and may be used in +combination with each other or with regular buffer macros. 
For an explanation +of each annotation, see the advanced annotations section. +------------------------------------------------------------------------------- +*/ + +#define __success(expr) _Success_(expr) +#define __nullterminated _Null_terminated_ +#define __nullnullterminated +#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) +#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) +#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) +#define __override __inner_override +#define __callback __inner_callback +#define __format_string _Printf_format_string_ +#define __blocksOn(resource) __inner_blocksOn(resource) +#define __control_entrypoint(category) __inner_control_entrypoint(category) +#define __data_entrypoint(category) __inner_data_entrypoint(category) +#define __useHeader _Use_decl_anno_impl_ +#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) + +#ifndef __fallthrough // [ + __inner_fallthrough_dec + #define __fallthrough __inner_fallthrough +#endif // ] + +#ifndef __analysis_assume // [ +#ifdef _PREFAST_ // [ +#define __analysis_assume(expr) __assume(expr) +#else // ][ +#define __analysis_assume(expr) +#endif // ] +#endif // ] + +#ifndef _Analysis_assume_ // [ +#ifdef _PREFAST_ // [ +#define _Analysis_assume_(expr) __assume(expr) +#else // ][ +#define _Analysis_assume_(expr) +#endif // ] +#endif // ] + +#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) + +#ifdef _PREFAST_ // [ +__inline __nothrow +void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); + +#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) +#else // ][ +#define _Analysis_assume_nullterminated_(x) +#endif // ] + +// +// Set the analysis mode (global flags to analysis). +// They take effect at the point of declaration; use at global scope +// as a declaration. +// + +// Synthesize a unique symbol. +#define ___MKID(x, y) x ## y +#define __MKID(x, y) ___MKID(x, y) +#define __GENSYM(x) __MKID(x, __COUNTER__) + +__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) + +#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) + +#define _Analysis_mode_(mode) \ + typedef _Analysis_mode_impl_(mode) int \ + __GENSYM(__prefast_analysis_mode_flag); + +// The following are predefined: +// _Analysis_operator_new_throw_ (operator new throws) +// _Analysis_operator_new_null_ (operator new returns null) +// _Analysis_operator_new_never_fails_ (operator new never fails) +// + +// Function class annotations. +__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) +__PRIMOP(int, _In_function_class_(__In_impl_ char*);) +#define _In_function_class_(x) _In_function_class_(#x) + +#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) + +/* + * interlocked operand used in interlocked instructions + */ +//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) + +#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) +#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) + +#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) +#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) + +#ifdef __cplusplus // [ +} +#endif // ] + +// Rotor doesn't need concurrency sal. 
+// #include <concurrencysal.h>
+
+#define _Interlocked_operand_
diff --git a/PlatformSupport/MSVCStubs.hpp b/PlatformSupport/MSVCStubs.hpp
new file mode 100644
index 0000000..63a560a
--- /dev/null
+++ b/PlatformSupport/MSVCStubs.hpp
@@ -0,0 +1,39 @@
+#include <cstddef>
+
+#define __cdecl
+
+typedef struct _GUID {
+    unsigned long  Data1;
+    unsigned short Data2;
+    unsigned short Data3;
+    unsigned char  Data4[8];
+} GUID;
+
+typedef size_t HANDLE;
+
+// Consume the argument so unused-parameter warnings are suppressed.
+#define UNREFERENCED_PARAMETER(P) (void)(P)
+
+#define DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) \
+extern "C++" { \
+inline ENUMTYPE operator | (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((int)a) | ((int)b)); } \
+inline ENUMTYPE &operator |= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((int &)a) |= ((int)b)); } \
+inline ENUMTYPE operator & (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((int)a) & ((int)b)); } \
+inline ENUMTYPE &operator &= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((int &)a) &= ((int)b)); } \
+inline ENUMTYPE operator ~ (ENUMTYPE a) { return ENUMTYPE(~((int)a)); } \
+inline ENUMTYPE operator ^ (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((int)a) ^ ((int)b)); } \
+inline ENUMTYPE &operator ^= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((int &)a) ^= ((int)b)); } \
+}
+
+#define ARRAYSIZE(a) \
+    ((sizeof(a) / sizeof(*(a))) / \
+    static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+
+#define _countof(array) (sizeof(array) / sizeof(array[0]))
+
+#include "MSVCSal.h"
+
+#include
+
+#define BOOL int
+#define TRUE 1
+#define FALSE 0
\ No newline at end of file
diff --git a/PlatformSupport/StdFS.hpp b/PlatformSupport/StdFS.hpp
new file mode 100644
index 0000000..193e7bf
--- /dev/null
+++ b/PlatformSupport/StdFS.hpp
@@ -0,0 +1,173 @@
+#pragma once
+
+#include <codecvt>
+#include <filesystem>
+#include <fstream>
+#include <locale>
+#include <string>
+#include <vector>
+
+template <typename T>
+class ScopedFile
+{
+public:
+
+
+    ScopedFile(const std::string & path, bool reset = false)
+    {
+        init(path, reset);
+    }
+
+    ScopedFile(const std::wstring & path, bool reset = false)
+    {
+        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
+        init(converter.to_bytes(path), reset);
+    }
+
+    virtual ~ScopedFile()
+    {
+        _stream.close();
+
+        if (this->_temp)
+        {
+            // Remove the file directly rather than through the virtual
+            // DeleteFile(): virtual dispatch does not reach the derived class
+            // from a base class destructor.
+            std::filesystem::remove(_path);
+        }
+    }
+
+    // Implemented by the read/write specializations below.
+    virtual size_t GetLength() = 0;
+    virtual void SeekBegin(size_t offset) = 0;
+    virtual void DeleteFile() = 0;
+
+    void MakeTemp()
+    {
+        // Mark the file as temporary so the destructor removes it.
+        this->_temp = true;
+    }
+
+protected:
+
+    T _stream;
+    std::string _path;
+private:
+
+
+    void init(const std::string & path, bool reset)
+    {
+        this->_path = path;
+        this->_temp = false;
+
+        if (reset)
+        {
+            _stream.open(path, std::ios::trunc);
+            _stream.close();
+        }
+
+        _stream.open(path, std::ios::binary);
+    }
+
+
+    bool _temp;
+};
+
+class ScopedWriteFile : public ScopedFile<std::ofstream>
+{
+public:
+
+    ScopedWriteFile(const std::string & name, bool reset = false) : ScopedFile(name, reset)
+    {
+        // -
+    }
+
+    ScopedWriteFile(const std::wstring & name, bool reset = false) : ScopedFile(name, reset)
+    {
+        // -
+    }
+
+    void SeekBegin(size_t offset)
+    {
+        _stream.seekp(offset, std::ios_base::beg);
+    }
+
+    size_t GetLength()
+    {
+        if (!_stream.is_open())
+        {
+            return 0;
+        }
+        auto old = _stream.tellp();
+        _stream.seekp(0, std::ios_base::end);
+        auto ret = static_cast<size_t>(_stream.tellp());
+        _stream.seekp(old, std::ios_base::beg);
+        return ret;
+    }
+
+    bool Write(const void * in, size_t length)
+    {
+        if (!_stream.is_open())
+        {
+            return false;
+        }
+        if (!_stream.write(reinterpret_cast<const char *>(in), static_cast<std::streamsize>(length)))
+        {
+            return false;
+        }
+        return true;
+    }
+
+    void DeleteFile()
+    {
+        std::filesystem::remove(_path);
+    }
+};
+
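+// Illustrative usage sketch (not part of the original file; the file name and
+// the 'blob' buffer are hypothetical):
+//
+//     ScopedWriteFile out("cache.bin", /*reset*/ true);
+//     if (out.Write(blob.data(), blob.size()))
+//     {
+//         out.MakeTemp(); // remove the file when 'out' goes out of scope
+//     }
+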
+class ScopedReadFile : public ScopedFile<std::ifstream>
+{
+public:
+
+    ScopedReadFile(const std::string & name) : ScopedFile(name, false)
+    {
+        // -
+    }
+
+    ScopedReadFile(const std::wstring & name) : ScopedFile(name, false)
+    {
+        // -
+    }
+
+    void SeekBegin(size_t offset)
+    {
+        _stream.seekg(offset, std::ios_base::beg);
+    }
+
+    void SeekEnd(size_t offset)
+    {
+        _stream.seekg(offset, std::ios_base::end);
+    }
+
+    size_t GetLength()
+    {
+        if (!_stream.is_open())
+        {
+            return 0;
+        }
+        auto old = _stream.tellg();
+        _stream.seekg(0, std::ios_base::end);
+        auto ret = static_cast<size_t>(_stream.tellg());
+        _stream.seekg(old, std::ios_base::beg);
+        return ret;
+    }
+
+    size_t Read(void * in, size_t length)
+    {
+        if (!_stream.is_open())
+        {
+            return 0;
+        }
+        return _stream.read(reinterpret_cast<char *>(in), static_cast<std::streamsize>(length))
+                   ? length
+                   : static_cast<size_t>(_stream.gcount());
+    }
+
+    size_t Read(size_t length)
+    {
+        std::vector<char> in;
+        in.resize(length);
+        return Read(in.data(), length);
+    }
+
+    void DeleteFile()
+    {
+        std::filesystem::remove(_path);
+    }
+};
diff --git a/PlatformSupport/Win32DXG.hpp b/PlatformSupport/Win32DXG.hpp
new file mode 100644
index 0000000..2c5f13a
--- /dev/null
+++ b/PlatformSupport/Win32DXG.hpp
@@ -0,0 +1,124 @@
+typedef enum DXGI_FORMAT {
+    DXGI_FORMAT_UNKNOWN,
+    DXGI_FORMAT_R32G32B32A32_TYPELESS,
+    DXGI_FORMAT_R32G32B32A32_FLOAT,
+    DXGI_FORMAT_R32G32B32A32_UINT,
+    DXGI_FORMAT_R32G32B32A32_SINT,
+    DXGI_FORMAT_R32G32B32_TYPELESS,
+    DXGI_FORMAT_R32G32B32_FLOAT,
+    DXGI_FORMAT_R32G32B32_UINT,
+    DXGI_FORMAT_R32G32B32_SINT,
+    DXGI_FORMAT_R16G16B16A16_TYPELESS,
+    DXGI_FORMAT_R16G16B16A16_FLOAT,
+    DXGI_FORMAT_R16G16B16A16_UNORM,
+    DXGI_FORMAT_R16G16B16A16_UINT,
+    DXGI_FORMAT_R16G16B16A16_SNORM,
+    DXGI_FORMAT_R16G16B16A16_SINT,
+    DXGI_FORMAT_R32G32_TYPELESS,
+    DXGI_FORMAT_R32G32_FLOAT,
+    DXGI_FORMAT_R32G32_UINT,
+    DXGI_FORMAT_R32G32_SINT,
+    DXGI_FORMAT_R32G8X24_TYPELESS,
+    DXGI_FORMAT_D32_FLOAT_S8X24_UINT,
+    DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS,
+    DXGI_FORMAT_X32_TYPELESS_G8X24_UINT,
+    DXGI_FORMAT_R10G10B10A2_TYPELESS,
+    DXGI_FORMAT_R10G10B10A2_UNORM,
+    DXGI_FORMAT_R10G10B10A2_UINT,
+    DXGI_FORMAT_R11G11B10_FLOAT,
+    DXGI_FORMAT_R8G8B8A8_TYPELESS,
+    DXGI_FORMAT_R8G8B8A8_UNORM,
+    DXGI_FORMAT_R8G8B8A8_UNORM_SRGB,
+    DXGI_FORMAT_R8G8B8A8_UINT,
+    DXGI_FORMAT_R8G8B8A8_SNORM,
+    DXGI_FORMAT_R8G8B8A8_SINT,
+    DXGI_FORMAT_R16G16_TYPELESS,
+    DXGI_FORMAT_R16G16_FLOAT,
+    DXGI_FORMAT_R16G16_UNORM,
+    DXGI_FORMAT_R16G16_UINT,
+    DXGI_FORMAT_R16G16_SNORM,
+    DXGI_FORMAT_R16G16_SINT,
+    DXGI_FORMAT_R32_TYPELESS,
+    DXGI_FORMAT_D32_FLOAT,
+    DXGI_FORMAT_R32_FLOAT,
+    DXGI_FORMAT_R32_UINT,
+    DXGI_FORMAT_R32_SINT,
+    DXGI_FORMAT_R24G8_TYPELESS,
+    DXGI_FORMAT_D24_UNORM_S8_UINT,
+    DXGI_FORMAT_R24_UNORM_X8_TYPELESS,
+    DXGI_FORMAT_X24_TYPELESS_G8_UINT,
+    DXGI_FORMAT_R8G8_TYPELESS,
+    DXGI_FORMAT_R8G8_UNORM,
+    DXGI_FORMAT_R8G8_UINT,
+    DXGI_FORMAT_R8G8_SNORM,
+    DXGI_FORMAT_R8G8_SINT,
+    DXGI_FORMAT_R16_TYPELESS,
+    DXGI_FORMAT_R16_FLOAT,
+    DXGI_FORMAT_D16_UNORM,
+    DXGI_FORMAT_R16_UNORM,
+    DXGI_FORMAT_R16_UINT,
+    DXGI_FORMAT_R16_SNORM,
+    DXGI_FORMAT_R16_SINT,
+    DXGI_FORMAT_R8_TYPELESS,
+    DXGI_FORMAT_R8_UNORM,
+    DXGI_FORMAT_R8_UINT,
+    DXGI_FORMAT_R8_SNORM,
+    DXGI_FORMAT_R8_SINT,
+    DXGI_FORMAT_A8_UNORM,
+    DXGI_FORMAT_R1_UNORM,
+    DXGI_FORMAT_R9G9B9E5_SHAREDEXP,
+    DXGI_FORMAT_R8G8_B8G8_UNORM,
+    DXGI_FORMAT_G8R8_G8B8_UNORM,
+    DXGI_FORMAT_BC1_TYPELESS,
+    DXGI_FORMAT_BC1_UNORM,
+    DXGI_FORMAT_BC1_UNORM_SRGB,
+    DXGI_FORMAT_BC2_TYPELESS,
+    DXGI_FORMAT_BC2_UNORM,
+    DXGI_FORMAT_BC2_UNORM_SRGB,
+    DXGI_FORMAT_BC3_TYPELESS,
+    DXGI_FORMAT_BC3_UNORM,
+    DXGI_FORMAT_BC3_UNORM_SRGB,
+    DXGI_FORMAT_BC4_TYPELESS,
+    DXGI_FORMAT_BC4_UNORM,
+    DXGI_FORMAT_BC4_SNORM,
+    DXGI_FORMAT_BC5_TYPELESS,
+    DXGI_FORMAT_BC5_UNORM,
+    DXGI_FORMAT_BC5_SNORM,
+    DXGI_FORMAT_B5G6R5_UNORM,
+    DXGI_FORMAT_B5G5R5A1_UNORM,
+    DXGI_FORMAT_B8G8R8A8_UNORM,
+    DXGI_FORMAT_B8G8R8X8_UNORM,
+    DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM,
+    DXGI_FORMAT_B8G8R8A8_TYPELESS,
+    DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,
+    DXGI_FORMAT_B8G8R8X8_TYPELESS,
+    DXGI_FORMAT_B8G8R8X8_UNORM_SRGB,
+    DXGI_FORMAT_BC6H_TYPELESS,
+    DXGI_FORMAT_BC6H_UF16,
+    DXGI_FORMAT_BC6H_SF16,
+    DXGI_FORMAT_BC7_TYPELESS,
+    DXGI_FORMAT_BC7_UNORM,
+    DXGI_FORMAT_BC7_UNORM_SRGB,
+    DXGI_FORMAT_AYUV,
+    DXGI_FORMAT_Y410,
+    DXGI_FORMAT_Y416,
+    DXGI_FORMAT_NV12,
+    DXGI_FORMAT_P010,
+    DXGI_FORMAT_P016,
+    DXGI_FORMAT_420_OPAQUE,
+    DXGI_FORMAT_YUY2,
+    DXGI_FORMAT_Y210,
+    DXGI_FORMAT_Y216,
+    DXGI_FORMAT_NV11,
+    DXGI_FORMAT_AI44,
+    DXGI_FORMAT_IA44,
+    DXGI_FORMAT_P8,
+    DXGI_FORMAT_A8P8,
+    DXGI_FORMAT_B4G4R4A4_UNORM,
+    // The remaining values are not contiguous in dxgiformat.h; pin them so the
+    // numeric values match the real enumeration (DDS DX10 headers store these
+    // integers on disk).
+    DXGI_FORMAT_P208 = 130,
+    DXGI_FORMAT_V208,
+    DXGI_FORMAT_V408,
+    DXGI_FORMAT_SAMPLER_FEEDBACK_MIN_MIP_OPAQUE = 189,
+    DXGI_FORMAT_SAMPLER_FEEDBACK_MIP_REGION_USED_OPAQUE,
+    DXGI_FORMAT_FORCE_UINT = 0xffffffff
+} DXGI_FORMAT;
diff --git a/PlatformSupport/Win32Errors.hpp b/PlatformSupport/Win32Errors.hpp
new file mode 100644
index 0000000..d696a69
--- /dev/null
+++ b/PlatformSupport/Win32Errors.hpp
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <cstdint>
+
+using HRESULT = LONG;
+using WORD = uint16_t;
+using DWORD = uint32_t;
+using UINT = uint32_t;
+using INT = int32_t;
+
+#define FACILITY_WIN32 7
+
+#define SEVERITY_SUCCESS 0
+#define SEVERITY_ERROR 1
+
+#define __HRESULT_FROM_WIN32(x) ((HRESULT)(x) <= 0 ? ((HRESULT)(x)) : ((HRESULT) (((x) & 0x0000FFFF) | (FACILITY_WIN32 << 16) | 0x80000000)))
+#define HRESULT_FROM_WIN32(x) __HRESULT_FROM_WIN32(x)
+
+#define FAILED(hr) (((HRESULT)(hr)) < 0)
+// Use a 32-bit cast so the sign bit lands in bit 31 even on LP64 targets,
+// where unsigned long is 64 bits wide.
+#define IS_ERROR(Status) (((uint32_t)(Status)) >> 31 == SEVERITY_ERROR)
+
+#define S_OK ((HRESULT)0)
+#define S_FALSE ((HRESULT)1L)
+#define E_UNEXPECTED ((HRESULT)(0x8000FFFFL))
+#define E_NOTIMPL ((HRESULT)(0x80004001L))
+#define E_OUTOFMEMORY ((HRESULT)(0x8007000EL))
+#define E_INVALIDARG ((HRESULT)(0x80070057L))
+#define E_NOINTERFACE ((HRESULT)(0x80004002L))
+#define E_POINTER ((HRESULT)(0x80004003L))
+#define E_HANDLE ((HRESULT)(0x80070006L))
+#define E_ABORT ((HRESULT)(0x80004004L))
+#define E_FAIL ((HRESULT)(0x80004005L))
+#define E_ACCESSDENIED ((HRESULT)(0x80070005L))
+#define E_PENDING ((HRESULT)(0x8000000AL))
+
+#define ERROR_SUCCESS 0
+#define ERROR_INVALID_FUNCTION 1
+#define ERROR_FILE_NOT_FOUND 2
+#define ERROR_PATH_NOT_FOUND 3
+#define ERROR_TOO_MANY_OPEN_FILES 4
+#define ERROR_ACCESS_DENIED 5
+#define ERROR_INVALID_HANDLE 6
+#define ERROR_ARENA_TRASHED 7
+#define ERROR_NOT_ENOUGH_MEMORY 8
+#define ERROR_INVALID_BLOCK 9
+#define ERROR_BAD_ENVIRONMENT 10
+#define ERROR_BAD_FORMAT 11
+#define ERROR_INVALID_ACCESS 12
+#define ERROR_INVALID_DATA 13
+#define ERROR_OUTOFMEMORY 14
+#define ERROR_HANDLE_EOF 38
+#define ERROR_NOT_SUPPORTED 50
+#define ERROR_FILE_EXISTS 80
+#define ERROR_CANNOT_MAKE 82
+#define ERROR_INVALID_PARAMETER 87
+#define ERROR_CALL_NOT_IMPLEMENTED 120
+#define ERROR_INSUFFICIENT_BUFFER 122
+#define ERROR_INVALID_NAME 123
+#define ERROR_MOD_NOT_FOUND 126
+#define ERROR_FILE_TOO_LARGE 223
+#define ERROR_NO_MORE_ITEMS 259
+#define ERROR_INVALID_ADDRESS 487
+#define ERROR_ARITHMETIC_OVERFLOW 534
+#define ERROR_INVALID_STATE 5023
+
+#define E_NOT_SUFFICIENT_BUFFER HRESULT_FROM_WIN32(ERROR_INSUFFICIENT_BUFFER)
+#define E_NOT_VALID_STATE HRESULT_FROM_WIN32(ERROR_INVALID_STATE)
\ No newline at end of file
diff --git a/PlatformSupport/Win32Public.hpp b/PlatformSupport/Win32Public.hpp
new file mode 100644
index 0000000..f0390dd
--- /dev/null
+++ b/PlatformSupport/Win32Public.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+#if defined(_DXTX_NOWIN)
+    // LONG must be signed and 32 bits wide: FAILED() tests the HRESULT sign
+    // bit, and size_t (unsigned) would make FAILED() always false.
+    using LONG = int;
+    #include "Win32DXG.hpp"
+    #include "Win32Errors.hpp"
+#endif
\ No newline at end of file
diff --git a/cmake_install.cmake b/cmake_install.cmake
new file mode 100644
index 0000000..85f2eda
--- /dev/null
+++ b/cmake_install.cmake
@@ -0,0 +1,54 @@
+# Install script for directory: /run/media/reece/Misc/GameEngine/Master/Public/DirectXTex
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "0")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if(CMAKE_INSTALL_COMPONENT)
+  set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
+else()
+  set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
+endif()
+
+string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
+       "${CMAKE_INSTALL_MANIFEST_FILES}")
+file(WRITE "/run/media/reece/Misc/GameEngine/Master/Public/DirectXTex/${CMAKE_INSTALL_MANIFEST}"
+     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
diff --git a/grep b/grep
new file mode 100644
index 0000000..e69de29