From cd3cc166b4dc63efb60b53178ed0c5e0f493807c Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour
Date: Tue, 27 Aug 2019 11:42:48 -0700
Subject: [PATCH 1/2] Add Neon to glm

A few simple functions that use NEON intrinsics directly, since the
compiler does not exploit the full potential of NEON on its own.
For now, -DGLM_FORCE_NEON is required until NEON becomes the default.
---
 glm/detail/qualifier.hpp      |  20 ++
 glm/detail/type_vec4_simd.inl | 334 ++++++++++++++++++++++++++++++++++
 glm/simd/platform.h           |  29 ++-
 3 files changed, 378 insertions(+), 5 deletions(-)

diff --git a/glm/detail/qualifier.hpp b/glm/detail/qualifier.hpp
index 115817f1..b6c9df0c 100644
--- a/glm/detail/qualifier.hpp
+++ b/glm/detail/qualifier.hpp
@@ -167,6 +167,26 @@ namespace detail
 	};
 #	endif
 
+#	if GLM_ARCH & GLM_ARCH_NEON_BIT
+	template<>
+	struct storage<4, float, true>
+	{
+		typedef glm_f32vec4 type;
+	};
+
+	template<>
+	struct storage<4, int, true>
+	{
+		typedef glm_i32vec4 type;
+	};
+
+	template<>
+	struct storage<4, unsigned int, true>
+	{
+		typedef glm_u32vec4 type;
+	};
+#	endif
+
 	enum genTypeEnum
 	{
 		GENTYPE_VEC,
diff --git a/glm/detail/type_vec4_simd.inl b/glm/detail/type_vec4_simd.inl
index 415b73cb..404c991c 100644
--- a/glm/detail/type_vec4_simd.inl
+++ b/glm/detail/type_vec4_simd.inl
@@ -461,3 +461,337 @@ namespace detail
 }//namespace glm
 
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
+
+#if GLM_ARCH & GLM_ARCH_NEON_BIT
+namespace glm {
+namespace detail {
+
+	template<qualifier Q>
+	struct compute_vec4_add<float, Q, true>
+	{
+		static
+		vec<4, float, Q>
+		call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+			Result.data = vaddq_f32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_add<uint, Q, true>
+	{
+		static
+		vec<4, uint, Q>
+		call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			vec<4, uint, Q> Result;
+			Result.data = vaddq_u32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_add<int, Q, true>
+	{
+		static
+		vec<4, int, Q>
+		call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			vec<4, int, Q> Result;
+			Result.data = vaddq_s32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_sub<float, Q, true>
+	{
+		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+			Result.data = vsubq_f32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_sub<uint, Q, true>
+	{
+		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			vec<4, uint, Q> Result;
+			Result.data = vsubq_u32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_sub<int, Q, true>
+	{
+		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			vec<4, int, Q> Result;
+			Result.data = vsubq_s32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_mul<float, Q, true>
+	{
+		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+			Result.data = vmulq_f32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_mul<uint, Q, true>
+	{
+		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			vec<4, uint, Q> Result;
+			Result.data = vmulq_u32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_mul<int, Q, true>
+	{
+		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			vec<4, int, Q> Result;
+			Result.data = vmulq_s32(a.data, b.data);
+			return Result;
+		}
+	};
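+
+	// NEON has no integer division intrinsics, and vdivq_f32 exists only on
+	// AArch64 (ARMv8), so the division specializations below provide
+	// fallbacks for both cases.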
+	template<qualifier Q>
+	struct compute_vec4_div<float, Q, true>
+	{
+		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			Result.data = vdivq_f32(a.data, b.data);
+#else
+			// Reciprocal estimate refined by two Newton-Raphson steps.
+			float32x4_t x = vrecpeq_f32(b.data);
+			x = vmulq_f32(x, vrecpsq_f32(b.data, x));
+			x = vmulq_f32(x, vrecpsq_f32(b.data, x));
+			Result.data = vmulq_f32(a.data, x);
+#endif
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_div<uint, Q, true>
+	{
+		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			// No vdivq_u32: divide lane by lane.
+			uint32x4_t q = vdupq_n_u32(0);
+			q = vsetq_lane_u32(vgetq_lane_u32(a.data, 0) / vgetq_lane_u32(b.data, 0), q, 0);
+			q = vsetq_lane_u32(vgetq_lane_u32(a.data, 1) / vgetq_lane_u32(b.data, 1), q, 1);
+			q = vsetq_lane_u32(vgetq_lane_u32(a.data, 2) / vgetq_lane_u32(b.data, 2), q, 2);
+			q = vsetq_lane_u32(vgetq_lane_u32(a.data, 3) / vgetq_lane_u32(b.data, 3), q, 3);
+			vec<4, uint, Q> Result;
+			Result.data = q;
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_div<int, Q, true>
+	{
+		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			// No vdivq_s32: divide lane by lane.
+			int32x4_t q = vdupq_n_s32(0);
+			q = vsetq_lane_s32(vgetq_lane_s32(a.data, 0) / vgetq_lane_s32(b.data, 0), q, 0);
+			q = vsetq_lane_s32(vgetq_lane_s32(a.data, 1) / vgetq_lane_s32(b.data, 1), q, 1);
+			q = vsetq_lane_s32(vgetq_lane_s32(a.data, 2) / vgetq_lane_s32(b.data, 2), q, 2);
+			q = vsetq_lane_s32(vgetq_lane_s32(a.data, 3) / vgetq_lane_s32(b.data, 3), q, 3);
+			vec<4, int, Q> Result;
+			Result.data = q;
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_equal<float, Q, true>
+	{
+		static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
+		{
+			// All-equal reduction: pairwise-minimum the comparison mask until
+			// lane 0 holds the minimum of all four lanes.
+			uint32x4_t cmp = vceqq_f32(v1.data, v2.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			cmp = vpminq_u32(cmp, cmp);
+			cmp = vpminq_u32(cmp, cmp);
+			uint32_t r = vgetq_lane_u32(cmp, 0);
+#else
+			uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+			cmpx2 = vpmin_u32(cmpx2, cmpx2);
+			uint32_t r = vget_lane_u32(cmpx2, 0);
+#endif
+			return r == ~0u;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_equal<uint, Q, true>
+	{
+		static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
+		{
+			uint32x4_t cmp = vceqq_u32(v1.data, v2.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			cmp = vpminq_u32(cmp, cmp);
+			cmp = vpminq_u32(cmp, cmp);
+			uint32_t r = vgetq_lane_u32(cmp, 0);
+#else
+			uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+			cmpx2 = vpmin_u32(cmpx2, cmpx2);
+			uint32_t r = vget_lane_u32(cmpx2, 0);
+#endif
+			return r == ~0u;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_equal<int, Q, true>
+	{
+		static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
+		{
+			uint32x4_t cmp = vceqq_s32(v1.data, v2.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			cmp = vpminq_u32(cmp, cmp);
+			cmp = vpminq_u32(cmp, cmp);
+			uint32_t r = vgetq_lane_u32(cmp, 0);
+#else
+			uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+			cmpx2 = vpmin_u32(cmpx2, cmpx2);
+			uint32_t r = vget_lane_u32(cmpx2, 0);
+#endif
+			return r == ~0u;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_nequal<float, Q, true>
+	{
+		static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
+		{
+			return !compute_vec4_equal<float, Q, true>::call(v1, v2);
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_nequal<uint, Q, true>
+	{
+		static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
+		{
+			return !compute_vec4_equal<uint, Q, true>::call(v1, v2);
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_nequal<int, Q, true>
+	{
+		static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
+		{
+			return !compute_vec4_equal<int, Q, true>::call(v1, v2);
+		}
+	};
+
+}//namespace detail
+
+#if !GLM_CONFIG_XYZW_ONLY
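+	// Scalar broadcast constructors: vdupq_n_f32/_s32/_u32 splat one value
+	// across all four lanes; vcvtq_f32_s32/_u32 convert integer lanes to float.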
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) :
+		data(vdupq_n_f32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) :
+		data(vdupq_n_f32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) :
+		data(vdupq_n_f32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) :
+		data(vdupq_n_s32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) :
+		data(vdupq_n_s32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) :
+		data(vdupq_n_s32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_lowp>::vec(uint _s) :
+		data(vdupq_n_u32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_mediump>::vec(uint _s) :
+		data(vdupq_n_u32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_highp>::vec(uint _s) :
+		data(vdupq_n_u32(_s))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& rhs) :
+		data(rhs.data)
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, int, aligned_highp>& rhs) :
+		data(vcvtq_f32_s32(rhs.data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, uint, aligned_highp>& rhs) :
+		data(vcvtq_f32_u32(rhs.data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) :
+		data(vcvtq_f32_s32(vec<4, int, aligned_lowp>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) :
+		data(vcvtq_f32_s32(vec<4, int, aligned_mediump>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) :
+		data(vcvtq_f32_s32(vec<4, int, aligned_highp>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(uint _x, uint _y, uint _z, uint _w) :
+		data(vcvtq_f32_u32(vec<4, uint, aligned_lowp>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(uint _x, uint _y, uint _z, uint _w) :
+		data(vcvtq_f32_u32(vec<4, uint, aligned_mediump>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(uint _x, uint _y, uint _z, uint _w) :
+		data(vcvtq_f32_u32(vec<4, uint, aligned_highp>(_x, _y, _z, _w).data))
+	{}
+
+#endif
+}//namespace glm
+
+#endif
diff --git a/glm/simd/platform.h b/glm/simd/platform.h
index 58168205..e522f6ca 100644
--- a/glm/simd/platform.h
+++ b/glm/simd/platform.h
@@ -235,10 +235,11 @@
 
 // User defines: GLM_FORCE_PURE GLM_FORCE_INTRINSICS GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2 GLM_FORCE_AVX2
 
-#define GLM_ARCH_MIPS_BIT	(0x10000000)
-#define GLM_ARCH_PPC_BIT	(0x20000000)
-#define GLM_ARCH_ARM_BIT	(0x40000000)
-#define GLM_ARCH_X86_BIT	(0x80000000)
+#define GLM_ARCH_MIPS_BIT	(0x10000000)
+#define GLM_ARCH_PPC_BIT	(0x20000000)
+#define GLM_ARCH_ARM_BIT	(0x40000000)
+#define GLM_ARCH_ARMV8_BIT	(0x01000000)
+#define GLM_ARCH_X86_BIT	(0x80000000)
 
 #define GLM_ARCH_SIMD_BIT	(0x00001000)
 
@@ -263,6 +264,7 @@
 #define GLM_ARCH_AVX	(GLM_ARCH_AVX_BIT | GLM_ARCH_SSE42)
 #define GLM_ARCH_AVX2	(GLM_ARCH_AVX2_BIT | GLM_ARCH_AVX)
 #define GLM_ARCH_ARM	(GLM_ARCH_ARM_BIT)
+#define GLM_ARCH_ARMV8	(GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM | GLM_ARCH_ARMV8_BIT)
 #define GLM_ARCH_NEON	(GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM)
 #define GLM_ARCH_MIPS	(GLM_ARCH_MIPS_BIT)
 #define GLM_ARCH_PPC	(GLM_ARCH_PPC_BIT)
@@ -270,7 +272,11 @@
 #if defined(GLM_FORCE_ARCH_UNKNOWN) || defined(GLM_FORCE_PURE)
 #	define GLM_ARCH GLM_ARCH_UNKNOWN
 #elif defined(GLM_FORCE_NEON)
-#	define GLM_ARCH (GLM_ARCH_NEON)
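+	// __ARM_ARCH >= 8 indicates an AArch64/ARMv8 toolchain, where additional
+	// intrinsics such as vdivq_f32 and vpminq_u32 are available.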
+#	if __ARM_ARCH >= 8
+#		define GLM_ARCH (GLM_ARCH_ARMV8)
+#	else
+#		define GLM_ARCH (GLM_ARCH_NEON)
+#	endif
 #	define GLM_FORCE_INTRINSICS
 #elif defined(GLM_FORCE_AVX2)
 #	define GLM_ARCH (GLM_ARCH_AVX2)
@@ -313,9 +319,14 @@
 #		define GLM_ARCH (GLM_ARCH_SSE2)
 #	elif defined(__i386__)
 #		define GLM_ARCH (GLM_ARCH_X86)
+#	elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
+#		define GLM_ARCH (GLM_ARCH_ARMV8)
+#warning "ARM v8"
 #	elif defined(__ARM_NEON)
+#warning "ARM NEON"
 #		define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
 #	elif defined(__arm__ ) || defined(_M_ARM)
+#warning "ARM v6"
 #		define GLM_ARCH (GLM_ARCH_ARM)
 #	elif defined(__mips__ )
 #		define GLM_ARCH (GLM_ARCH_MIPS)
@@ -355,6 +366,8 @@
 #	include <pmmintrin.h>
 #elif GLM_ARCH & GLM_ARCH_SSE2_BIT
 #	include <emmintrin.h>
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+#	include <arm_neon.h>
 #endif//GLM_ARCH
 
 #if GLM_ARCH & GLM_ARCH_SSE2_BIT
@@ -380,3 +393,9 @@
 	typedef __m256i			glm_i64vec4;
 	typedef __m256i			glm_u64vec4;
 #endif
+
+#if GLM_ARCH & GLM_ARCH_NEON_BIT
+	typedef float32x4_t		glm_f32vec4;
+	typedef int32x4_t		glm_i32vec4;
+	typedef uint32x4_t		glm_u32vec4;
+#endif

From d2aa30fe92c837775c02df27fce5f35ca77aab22 Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour
Date: Tue, 3 Sep 2019 10:14:30 -0700
Subject: [PATCH 2/2] Remove debug warnings

---
 glm/simd/platform.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/glm/simd/platform.h b/glm/simd/platform.h
index e522f6ca..378513ea 100644
--- a/glm/simd/platform.h
+++ b/glm/simd/platform.h
@@ -321,12 +321,9 @@
 #		define GLM_ARCH (GLM_ARCH_X86)
 #	elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
 #		define GLM_ARCH (GLM_ARCH_ARMV8)
-#warning "ARM v8"
 #	elif defined(__ARM_NEON)
-#warning "ARM NEON"
 #		define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
 #	elif defined(__arm__ ) || defined(_M_ARM)
-#warning "ARM v6"
 #		define GLM_ARCH (GLM_ARCH_ARM)
 #	elif defined(__mips__ )
 #		define GLM_ARCH (GLM_ARCH_MIPS)
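
For reference, a minimal smoke test of the new paths. This is a hypothetical
program, not part of the patch series; it assumes an ARM toolchain with glm on
the include path, and the avec4 typedef is introduced here only for brevity:

// Hypothetical smoke test for the NEON paths; not part of this patch series.
// GLM_FORCE_NEON must be defined before any glm header (the commit message
// suggests -DGLM_FORCE_NEON on the command line instead).
#define GLM_FORCE_NEON
#include <glm/glm.hpp>
#include <cstdio>

// With NEON enabled, storage<4, float, true>::type is float32x4_t,
// so this vector's operators dispatch to the specializations above.
typedef glm::vec<4, float, glm::aligned_highp> avec4;

int main()
{
    avec4 a(1.0f, 2.0f, 3.0f, 4.0f);
    avec4 b(2.0f);                   // splat constructor -> vdupq_n_f32
    avec4 c = (a + b) * a;           // -> vaddq_f32, vmulq_f32
    std::printf("%g %g %g %g\n", c.x, c.y, c.z, c.w);
    return (c == c) ? 0 : 1;         // -> vceqq_f32 + pairwise-min reduction
}

Built for ARMv7, the same program exercises the non-ARMV8_BIT branches
(vpmin_u32 reduction and the reciprocal-based division fallback) instead.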