diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp b/reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp new file mode 100644 index 00000000..9a8377d3 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp @@ -0,0 +1,892 @@ +#version 450 + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#extension GL_KHR_shader_subgroup_basic : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5)) +#extension GL_AMD_gpu_shader_int64 : enable +#extension GL_NV_gpu_shader5 : enable +#extension GL_AMD_gcn_shader : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. 
+#endif +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 1, std430) buffer DATA_OUT +{ + float data_out_float; + vec2 data_out_vec2; + vec3 data_out_vec3; + vec4 data_out_vec4; + double data_out_double; + dvec2 data_out_dvec2; + dvec3 data_out_dvec3; + dvec4 data_out_dvec4; +} _16; + +layout(binding = 0, std430) buffer DATA_IN +{ + float data_in_float[128]; + vec2 data_in_vec2[128]; + vec3 data_in_vec3[128]; + vec4 data_in_vec4[128]; + double data_in_double[128]; + dvec2 data_in_dvec2[128]; + dvec3 data_in_dvec3[128]; + dvec4 data_in_dvec4[128]; +} _31; + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u) +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u) +#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u) +#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u) +#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u) +#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u) +#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupSize gl_WarpSizeNV +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupSize gl_SubGroupSizeARB +#elif defined(GL_AMD_gcn_shader) +#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD) +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +uvec4 subgroupBallot(bool v) { return uvec4(ballotThreadNV(v), 0u, 0u, 0u); } +#elif defined(GL_ARB_shader_ballot) +uvec4 subgroupBallot(bool v) { return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u); } +#endif + +#ifndef GL_KHR_shader_subgroup_basic +bool subgroupElect() +{ + uvec4 activeMask = subgroupBallot(true); + uint firstLive = subgroupBallotFindLSB(activeMask); + return gl_SubgroupInvocationID == firstLive; +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +uint subgroupBallotBitCount(uvec4 value) +{ + ivec2 c = bitCount(value.xy); +#ifdef GL_NV_shader_thread_group + return uint(c.x); +#else + return uint(c.x + c.y); +#endif +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +bool subgroupBallotBitExtract(uvec4 value, uint index) +{ +#ifdef GL_NV_shader_thread_group + uint shifted = value.x >> index; +#else + uint shifted = value[index >> 5u] >> (index & 0x1fu); +#endif + return (shifted & 1u) != 0u; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +float subgroupAdd(float v) +{ + float reduction = 0.0f; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + float s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : 0.0f; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + float s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? 
s : 0.0f; + } + } + return reduction; +} +vec2 subgroupAdd(vec2 v) +{ + vec2 reduction = vec2(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : vec2(0.0f); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : vec2(0.0f); + } + } + return reduction; +} +vec3 subgroupAdd(vec3 v) +{ + vec3 reduction = vec3(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : vec3(0.0f); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : vec3(0.0f); + } + } + return reduction; +} +vec4 subgroupAdd(vec4 v) +{ + vec4 reduction = vec4(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : vec4(0.0f); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : vec4(0.0f); + } + } + return reduction; +} +double subgroupAdd(double v) +{ + double reduction = 0.0LF; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + double s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : 0.0LF; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + double s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : 0.0LF; + } + } + return reduction; +} +dvec2 subgroupAdd(dvec2 v) +{ + dvec2 reduction = dvec2(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : dvec2(0.0LF); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? 
s : dvec2(0.0LF); + } + } + return reduction; +} +dvec3 subgroupAdd(dvec3 v) +{ + dvec3 reduction = dvec3(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : dvec3(0.0LF); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : dvec3(0.0LF); + } + } + return reduction; +} +dvec4 subgroupAdd(dvec4 v) +{ + dvec4 reduction = dvec4(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : dvec4(0.0LF); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : dvec4(0.0LF); + } + } + return reduction; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +float subgroupExclusiveAdd(float v) +{ + float excl_scan = 0.0f; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + float s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : 0.0f; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 0.0f; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + float s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : 0.0f; + } + } + return excl_scan; +} +vec2 subgroupExclusiveAdd(vec2 v) +{ + vec2 excl_scan = vec2(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : vec2(0.0f); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = vec2(0.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : vec2(0.0f); + } + } + return excl_scan; +} +vec3 subgroupExclusiveAdd(vec3 v) +{ + vec3 excl_scan = vec3(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? 
s : vec3(0.0f); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = vec3(0.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : vec3(0.0f); + } + } + return excl_scan; +} +vec4 subgroupExclusiveAdd(vec4 v) +{ + vec4 excl_scan = vec4(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : vec4(0.0f); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = vec4(0.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : vec4(0.0f); + } + } + return excl_scan; +} +double subgroupExclusiveAdd(double v) +{ + double excl_scan = 0.0LF; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + double s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : 0.0LF; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 0.0LF; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + double s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : 0.0LF; + } + } + return excl_scan; +} +dvec2 subgroupExclusiveAdd(dvec2 v) +{ + dvec2 excl_scan = dvec2(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : dvec2(0.0LF); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = dvec2(0.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : dvec2(0.0LF); + } + } + return excl_scan; +} +dvec3 subgroupExclusiveAdd(dvec3 v) +{ + dvec3 excl_scan = dvec3(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? 
s : dvec3(0.0LF); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = dvec3(0.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : dvec3(0.0LF); + } + } + return excl_scan; +} +dvec4 subgroupExclusiveAdd(dvec4 v) +{ + dvec4 excl_scan = dvec4(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : dvec4(0.0LF); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = dvec4(0.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : dvec4(0.0LF); + } + } + return excl_scan; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +float subgroupInclusiveAdd(float v) +{ + float incl_scan = 0.0f; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + float s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : 0.0f; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + float s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : 0.0f; + } + } + return incl_scan; +} +vec2 subgroupInclusiveAdd(vec2 v) +{ + vec2 incl_scan = vec2(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : vec2(0.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : vec2(0.0f); + } + } + return incl_scan; +} +vec3 subgroupInclusiveAdd(vec3 v) +{ + vec3 incl_scan = vec3(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? 
s : vec3(0.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : vec3(0.0f); + } + } + return incl_scan; +} +vec4 subgroupInclusiveAdd(vec4 v) +{ + vec4 incl_scan = vec4(0.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : vec4(0.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : vec4(0.0f); + } + } + return incl_scan; +} +double subgroupInclusiveAdd(double v) +{ + double incl_scan = 0.0LF; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + double s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : 0.0LF; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + double s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : 0.0LF; + } + } + return incl_scan; +} +dvec2 subgroupInclusiveAdd(dvec2 v) +{ + dvec2 incl_scan = dvec2(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : dvec2(0.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : dvec2(0.0LF); + } + } + return incl_scan; +} +dvec3 subgroupInclusiveAdd(dvec3 v) +{ + dvec3 incl_scan = dvec3(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : dvec3(0.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? 
s : dvec3(0.0LF); + } + } + return incl_scan; +} +dvec4 subgroupInclusiveAdd(dvec4 v) +{ + dvec4 incl_scan = dvec4(0.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : dvec4(0.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : dvec4(0.0LF); + } + } + return incl_scan; +} +#endif + +void main() +{ + _16.data_out_float = subgroupAdd(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupAdd(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupAdd(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupAdd(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupAdd(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupExclusiveAdd(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupExclusiveAdd(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupExclusiveAdd(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupExclusiveAdd(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupExclusiveAdd(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupExclusiveAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupExclusiveAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupExclusiveAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupInclusiveAdd(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupInclusiveAdd(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupInclusiveAdd(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupInclusiveAdd(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupInclusiveAdd(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupInclusiveAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupInclusiveAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupInclusiveAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]); +} + diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp.vk b/reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp.vk new file mode 100644 index 00000000..db446318 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp.vk @@ -0,0 +1,56 @@ +#version 450 +#extension GL_KHR_shader_subgroup_arithmetic : require +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(set = 0, binding = 1, std430) buffer DATA_OUT +{ + float data_out_float; + vec2 data_out_vec2; + vec3 data_out_vec3; + vec4 data_out_vec4; + double data_out_double; + dvec2 data_out_dvec2; + dvec3 data_out_dvec3; + dvec4 data_out_dvec4; +} _16; + 
+layout(set = 0, binding = 0, std430) buffer DATA_IN +{ + float data_in_float[128]; + vec2 data_in_vec2[128]; + vec3 data_in_vec3[128]; + vec4 data_in_vec4[128]; + double data_in_double[128]; + dvec2 data_in_dvec2[128]; + dvec3 data_in_dvec3[128]; + dvec4 data_in_dvec4[128]; +} _31; + +void main() +{ + _16.data_out_float = subgroupAdd(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupAdd(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupAdd(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupAdd(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupAdd(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupExclusiveAdd(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupExclusiveAdd(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupExclusiveAdd(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupExclusiveAdd(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupExclusiveAdd(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupExclusiveAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupExclusiveAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupExclusiveAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupInclusiveAdd(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupInclusiveAdd(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupInclusiveAdd(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupInclusiveAdd(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupInclusiveAdd(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupInclusiveAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupInclusiveAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupInclusiveAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]); +} + diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp b/reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp new file mode 100644 index 00000000..d9b117a1 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp @@ -0,0 +1,892 @@ +#version 450 + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. 
+#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#extension GL_KHR_shader_subgroup_basic : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5)) +#extension GL_AMD_gpu_shader_int64 : enable +#extension GL_NV_gpu_shader5 : enable +#extension GL_AMD_gcn_shader : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. 
+#endif +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 1, std430) buffer DATA_OUT +{ + float data_out_float; + vec2 data_out_vec2; + vec3 data_out_vec3; + vec4 data_out_vec4; + double data_out_double; + dvec2 data_out_dvec2; + dvec3 data_out_dvec3; + dvec4 data_out_dvec4; +} _16; + +layout(binding = 0, std430) buffer DATA_IN +{ + float data_in_float[128]; + vec2 data_in_vec2[128]; + vec3 data_in_vec3[128]; + vec4 data_in_vec4[128]; + double data_in_double[128]; + dvec2 data_in_dvec2[128]; + dvec3 data_in_dvec3[128]; + dvec4 data_in_dvec4[128]; +} _31; + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u) +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u) +#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u) +#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u) +#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u) +#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u) +#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupSize gl_WarpSizeNV +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupSize gl_SubGroupSizeARB +#elif defined(GL_AMD_gcn_shader) +#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD) +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +uvec4 subgroupBallot(bool v) { return uvec4(ballotThreadNV(v), 0u, 0u, 0u); } +#elif defined(GL_ARB_shader_ballot) +uvec4 subgroupBallot(bool v) { return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u); } +#endif + +#ifndef GL_KHR_shader_subgroup_basic +bool subgroupElect() +{ + uvec4 activeMask = subgroupBallot(true); + uint firstLive = subgroupBallotFindLSB(activeMask); + return gl_SubgroupInvocationID == firstLive; +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +uint subgroupBallotBitCount(uvec4 value) +{ + ivec2 c = bitCount(value.xy); +#ifdef GL_NV_shader_thread_group + return uint(c.x); +#else + return uint(c.x + c.y); +#endif +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +bool subgroupBallotBitExtract(uvec4 value, uint index) +{ +#ifdef GL_NV_shader_thread_group + uint shifted = value.x >> index; +#else + uint shifted = value[index >> 5u] >> (index & 0x1fu); +#endif + return (shifted & 1u) != 0u; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +float subgroupMul(float v) +{ + float reduction = 1.0f; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + float s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : 1.0f; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + float s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? 
s : 1.0f; + } + } + return reduction; +} +vec2 subgroupMul(vec2 v) +{ + vec2 reduction = vec2(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : vec2(1.0f); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : vec2(1.0f); + } + } + return reduction; +} +vec3 subgroupMul(vec3 v) +{ + vec3 reduction = vec3(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : vec3(1.0f); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : vec3(1.0f); + } + } + return reduction; +} +vec4 subgroupMul(vec4 v) +{ + vec4 reduction = vec4(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : vec4(1.0f); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : vec4(1.0f); + } + } + return reduction; +} +double subgroupMul(double v) +{ + double reduction = 0.0LF; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + double s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : 0.0LF; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + double s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : 0.0LF; + } + } + return reduction; +} +dvec2 subgroupMul(dvec2 v) +{ + dvec2 reduction = dvec2(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : dvec2(1.0LF); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? 
s : dvec2(1.0LF); + } + } + return reduction; +} +dvec3 subgroupMul(dvec3 v) +{ + dvec3 reduction = dvec3(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : dvec3(1.0LF); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : dvec3(1.0LF); + } + } + return reduction; +} +dvec4 subgroupMul(dvec4 v) +{ + dvec4 reduction = dvec4(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : dvec4(1.0LF); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : dvec4(1.0LF); + } + } + return reduction; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +float subgroupExclusiveMul(float v) +{ + float excl_scan = 1.0f; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + float s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : 1.0f; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 1.0f; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + float s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : 1.0f; + } + } + return excl_scan; +} +vec2 subgroupExclusiveMul(vec2 v) +{ + vec2 excl_scan = vec2(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : vec2(1.0f); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = vec2(1.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : vec2(1.0f); + } + } + return excl_scan; +} +vec3 subgroupExclusiveMul(vec3 v) +{ + vec3 excl_scan = vec3(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? 
s : vec3(1.0f); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = vec3(1.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : vec3(1.0f); + } + } + return excl_scan; +} +vec4 subgroupExclusiveMul(vec4 v) +{ + vec4 excl_scan = vec4(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : vec4(1.0f); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = vec4(1.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : vec4(1.0f); + } + } + return excl_scan; +} +double subgroupExclusiveMul(double v) +{ + double excl_scan = 0.0LF; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + double s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : 0.0LF; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 0.0LF; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + double s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : 0.0LF; + } + } + return excl_scan; +} +dvec2 subgroupExclusiveMul(dvec2 v) +{ + dvec2 excl_scan = dvec2(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : dvec2(1.0LF); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = dvec2(1.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : dvec2(1.0LF); + } + } + return excl_scan; +} +dvec3 subgroupExclusiveMul(dvec3 v) +{ + dvec3 excl_scan = dvec3(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? 
s : dvec3(1.0LF); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = dvec3(1.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : dvec3(1.0LF); + } + } + return excl_scan; +} +dvec4 subgroupExclusiveMul(dvec4 v) +{ + dvec4 excl_scan = dvec4(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : dvec4(1.0LF); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = dvec4(1.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : dvec4(1.0LF); + } + } + return excl_scan; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +float subgroupInclusiveMul(float v) +{ + float incl_scan = 1.0f; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + float s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : 1.0f; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + float s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : 1.0f; + } + } + return incl_scan; +} +vec2 subgroupInclusiveMul(vec2 v) +{ + vec2 incl_scan = vec2(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : vec2(1.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : vec2(1.0f); + } + } + return incl_scan; +} +vec3 subgroupInclusiveMul(vec3 v) +{ + vec3 incl_scan = vec3(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? 
s : vec3(1.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : vec3(1.0f); + } + } + return incl_scan; +} +vec4 subgroupInclusiveMul(vec4 v) +{ + vec4 incl_scan = vec4(1.0f); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + vec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : vec4(1.0f); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + vec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : vec4(1.0f); + } + } + return incl_scan; +} +double subgroupInclusiveMul(double v) +{ + double incl_scan = 0.0LF; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + double s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : 0.0LF; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + double s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : 0.0LF; + } + } + return incl_scan; +} +dvec2 subgroupInclusiveMul(dvec2 v) +{ + dvec2 incl_scan = dvec2(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : dvec2(1.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : dvec2(1.0LF); + } + } + return incl_scan; +} +dvec3 subgroupInclusiveMul(dvec3 v) +{ + dvec3 incl_scan = dvec3(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : dvec3(1.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? 
s : dvec3(1.0LF); + } + } + return incl_scan; +} +dvec4 subgroupInclusiveMul(dvec4 v) +{ + dvec4 incl_scan = dvec4(1.0LF); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + dvec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : dvec4(1.0LF); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + dvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : dvec4(1.0LF); + } + } + return incl_scan; +} +#endif + +void main() +{ + _16.data_out_float = subgroupMul(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupMul(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupMul(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupMul(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupMul(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupMul(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupMul(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupMul(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupExclusiveMul(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupExclusiveMul(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupExclusiveMul(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupExclusiveMul(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupExclusiveMul(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupExclusiveMul(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupExclusiveMul(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupExclusiveMul(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupInclusiveMul(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupInclusiveMul(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupInclusiveMul(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupInclusiveMul(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupInclusiveMul(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupInclusiveMul(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupInclusiveMul(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupInclusiveMul(_31.data_in_dvec4[gl_LocalInvocationID.x]); +} + diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp.vk b/reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp.vk new file mode 100644 index 00000000..42f41954 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp.vk @@ -0,0 +1,56 @@ +#version 450 +#extension GL_KHR_shader_subgroup_arithmetic : require +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(set = 0, binding = 1, std430) buffer DATA_OUT +{ + float data_out_float; + vec2 data_out_vec2; + vec3 data_out_vec3; + vec4 data_out_vec4; + double data_out_double; + dvec2 data_out_dvec2; + dvec3 data_out_dvec3; + dvec4 data_out_dvec4; +} _16; + 
+layout(set = 0, binding = 0, std430) buffer DATA_IN +{ + float data_in_float[128]; + vec2 data_in_vec2[128]; + vec3 data_in_vec3[128]; + vec4 data_in_vec4[128]; + double data_in_double[128]; + dvec2 data_in_dvec2[128]; + dvec3 data_in_dvec3[128]; + dvec4 data_in_dvec4[128]; +} _31; + +void main() +{ + _16.data_out_float = subgroupMul(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupMul(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupMul(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupMul(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupMul(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupMul(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupMul(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupMul(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupExclusiveMul(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupExclusiveMul(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupExclusiveMul(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupExclusiveMul(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupExclusiveMul(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupExclusiveMul(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupExclusiveMul(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupExclusiveMul(_31.data_in_dvec4[gl_LocalInvocationID.x]); + _16.data_out_float = subgroupInclusiveMul(_31.data_in_float[gl_LocalInvocationID.x]); + _16.data_out_vec2 = subgroupInclusiveMul(_31.data_in_vec2[gl_LocalInvocationID.x]); + _16.data_out_vec3 = subgroupInclusiveMul(_31.data_in_vec3[gl_LocalInvocationID.x]); + _16.data_out_vec4 = subgroupInclusiveMul(_31.data_in_vec4[gl_LocalInvocationID.x]); + _16.data_out_double = subgroupInclusiveMul(_31.data_in_double[gl_LocalInvocationID.x]); + _16.data_out_dvec2 = subgroupInclusiveMul(_31.data_in_dvec2[gl_LocalInvocationID.x]); + _16.data_out_dvec3 = subgroupInclusiveMul(_31.data_in_dvec3[gl_LocalInvocationID.x]); + _16.data_out_dvec4 = subgroupInclusiveMul(_31.data_in_dvec4[gl_LocalInvocationID.x]); +} + diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp b/reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp new file mode 100644 index 00000000..e95d86d9 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp @@ -0,0 +1,892 @@ +#version 450 + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. 
+#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#extension GL_KHR_shader_subgroup_basic : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5)) +#extension GL_AMD_gpu_shader_int64 : enable +#extension GL_NV_gpu_shader5 : enable +#extension GL_AMD_gcn_shader : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. 
+#endif +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 1, std430) buffer DATA_OUT +{ + int data_out_int; + ivec2 data_out_ivec2; + ivec3 data_out_ivec3; + ivec4 data_out_ivec4; + uint data_out_uint; + uvec2 data_out_uvec2; + uvec3 data_out_uvec3; + uvec4 data_out_uvec4; +} _16; + +layout(binding = 0, std430) buffer DATA_IN +{ + int data_in_int[128]; + ivec2 data_in_ivec2[128]; + ivec3 data_in_ivec3[128]; + ivec4 data_in_ivec4[128]; + uint data_in_uint[128]; + uvec2 data_in_uvec2[128]; + uvec3 data_in_uvec3[128]; + uvec4 data_in_uvec4[128]; +} _29; + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u) +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u) +#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u) +#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u) +#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u) +#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u) +#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupSize gl_WarpSizeNV +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupSize gl_SubGroupSizeARB +#elif defined(GL_AMD_gcn_shader) +#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD) +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +uvec4 subgroupBallot(bool v) { return uvec4(ballotThreadNV(v), 0u, 0u, 0u); } +#elif defined(GL_ARB_shader_ballot) +uvec4 subgroupBallot(bool v) { return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u); } +#endif + +#ifndef GL_KHR_shader_subgroup_basic +bool subgroupElect() +{ + uvec4 activeMask = subgroupBallot(true); + uint firstLive = subgroupBallotFindLSB(activeMask); + return gl_SubgroupInvocationID == firstLive; +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +uint subgroupBallotBitCount(uvec4 value) +{ + ivec2 c = bitCount(value.xy); +#ifdef GL_NV_shader_thread_group + return uint(c.x); +#else + return uint(c.x + c.y); +#endif +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +bool subgroupBallotBitExtract(uvec4 value, uint index) +{ +#ifdef GL_NV_shader_thread_group + uint shifted = value.x >> index; +#else + uint shifted = value[index >> 5u] >> (index & 0x1fu); +#endif + return (shifted & 1u) != 0u; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +uint subgroupAdd(uint v) +{ + uint reduction = 0u; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uint s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : 0u; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uint s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? 
s : 0u; + } + } + return reduction; +} +uvec2 subgroupAdd(uvec2 v) +{ + uvec2 reduction = uvec2(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : uvec2(0u); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : uvec2(0u); + } + } + return reduction; +} +uvec3 subgroupAdd(uvec3 v) +{ + uvec3 reduction = uvec3(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : uvec3(0u); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : uvec3(0u); + } + } + return reduction; +} +uvec4 subgroupAdd(uvec4 v) +{ + uvec4 reduction = uvec4(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : uvec4(0u); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : uvec4(0u); + } + } + return reduction; +} +int subgroupAdd(int v) +{ + int reduction = 0; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + int s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : 0; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + int s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : 0; + } + } + return reduction; +} +ivec2 subgroupAdd(ivec2 v) +{ + ivec2 reduction = ivec2(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : ivec2(0); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? 
s : ivec2(0); + } + } + return reduction; +} +ivec3 subgroupAdd(ivec3 v) +{ + ivec3 reduction = ivec3(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : ivec3(0); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : ivec3(0); + } + } + return reduction; +} +ivec4 subgroupAdd(ivec4 v) +{ + ivec4 reduction = ivec4(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction += valid ? s : ivec4(0); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction += valid ? s : ivec4(0); + } + } + return reduction; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +uint subgroupExclusiveAdd(uint v) +{ + uint excl_scan = 0u; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uint s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : 0u; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 0u; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uint s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : 0u; + } + } + return excl_scan; +} +uvec2 subgroupExclusiveAdd(uvec2 v) +{ + uvec2 excl_scan = uvec2(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : uvec2(0u); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = uvec2(0u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : uvec2(0u); + } + } + return excl_scan; +} +uvec3 subgroupExclusiveAdd(uvec3 v) +{ + uvec3 excl_scan = uvec3(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? 
s : uvec3(0u); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = uvec3(0u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : uvec3(0u); + } + } + return excl_scan; +} +uvec4 subgroupExclusiveAdd(uvec4 v) +{ + uvec4 excl_scan = uvec4(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : uvec4(0u); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = uvec4(0u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : uvec4(0u); + } + } + return excl_scan; +} +int subgroupExclusiveAdd(int v) +{ + int excl_scan = 0; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + int s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : 0; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 0; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + int s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : 0; + } + } + return excl_scan; +} +ivec2 subgroupExclusiveAdd(ivec2 v) +{ + ivec2 excl_scan = ivec2(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : ivec2(0); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = ivec2(0); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : ivec2(0); + } + } + return excl_scan; +} +ivec3 subgroupExclusiveAdd(ivec3 v) +{ + ivec3 excl_scan = ivec3(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? 
s : ivec3(0); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = ivec3(0); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : ivec3(0); + } + } + return excl_scan; +} +ivec4 subgroupExclusiveAdd(ivec4 v) +{ + ivec4 excl_scan = ivec4(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan += valid ? s : ivec4(0); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = ivec4(0); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan += valid ? s : ivec4(0); + } + } + return excl_scan; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +uint subgroupInclusiveAdd(uint v) +{ + uint incl_scan = 0u; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uint s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : 0u; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uint s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : 0u; + } + } + return incl_scan; +} +uvec2 subgroupInclusiveAdd(uvec2 v) +{ + uvec2 incl_scan = uvec2(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : uvec2(0u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : uvec2(0u); + } + } + return incl_scan; +} +uvec3 subgroupInclusiveAdd(uvec3 v) +{ + uvec3 incl_scan = uvec3(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : uvec3(0u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? 
s : uvec3(0u); + } + } + return incl_scan; +} +uvec4 subgroupInclusiveAdd(uvec4 v) +{ + uvec4 incl_scan = uvec4(0u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : uvec4(0u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : uvec4(0u); + } + } + return incl_scan; +} +int subgroupInclusiveAdd(int v) +{ + int incl_scan = 0; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + int s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : 0; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + int s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : 0; + } + } + return incl_scan; +} +ivec2 subgroupInclusiveAdd(ivec2 v) +{ + ivec2 incl_scan = ivec2(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : ivec2(0); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : ivec2(0); + } + } + return incl_scan; +} +ivec3 subgroupInclusiveAdd(ivec3 v) +{ + ivec3 incl_scan = ivec3(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? s : ivec3(0); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : ivec3(0); + } + } + return incl_scan; +} +ivec4 subgroupInclusiveAdd(ivec4 v) +{ + ivec4 incl_scan = ivec4(0); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan += valid ? 
s : ivec4(0); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan += valid ? s : ivec4(0); + } + } + return incl_scan; +} +#endif + +void main() +{ + _16.data_out_int = subgroupAdd(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupAdd(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupAdd(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupAdd(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupAdd(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupAdd(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupAdd(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupAdd(_29.data_in_uvec4[gl_LocalInvocationID.x]); + _16.data_out_int = subgroupExclusiveAdd(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupExclusiveAdd(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupExclusiveAdd(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupExclusiveAdd(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupExclusiveAdd(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupExclusiveAdd(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupExclusiveAdd(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupExclusiveAdd(_29.data_in_uvec4[gl_LocalInvocationID.x]); + _16.data_out_int = subgroupInclusiveAdd(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupInclusiveAdd(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupInclusiveAdd(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupInclusiveAdd(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupInclusiveAdd(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupInclusiveAdd(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupInclusiveAdd(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupInclusiveAdd(_29.data_in_uvec4[gl_LocalInvocationID.x]); +} + diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp.vk b/reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp.vk new file mode 100644 index 00000000..2423e4d2 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp.vk @@ -0,0 +1,56 @@ +#version 450 +#extension GL_KHR_shader_subgroup_arithmetic : require +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(set = 0, binding = 1, std430) buffer DATA_OUT +{ + int data_out_int; + ivec2 data_out_ivec2; + ivec3 data_out_ivec3; + ivec4 data_out_ivec4; + uint data_out_uint; + uvec2 data_out_uvec2; + uvec3 data_out_uvec3; + uvec4 data_out_uvec4; +} _16; + +layout(set = 0, binding = 0, std430) buffer DATA_IN +{ + int data_in_int[128]; + ivec2 data_in_ivec2[128]; + ivec3 data_in_ivec3[128]; + ivec4 data_in_ivec4[128]; + uint data_in_uint[128]; + uvec2 data_in_uvec2[128]; + uvec3 data_in_uvec3[128]; + uvec4 data_in_uvec4[128]; +} _29; + +void main() +{ + _16.data_out_int = subgroupAdd(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupAdd(_29.data_in_ivec2[gl_LocalInvocationID.x]); + 
_16.data_out_ivec3 = subgroupAdd(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupAdd(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupAdd(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupAdd(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupAdd(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupAdd(_29.data_in_uvec4[gl_LocalInvocationID.x]); + _16.data_out_int = subgroupExclusiveAdd(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupExclusiveAdd(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupExclusiveAdd(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupExclusiveAdd(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupExclusiveAdd(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupExclusiveAdd(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupExclusiveAdd(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupExclusiveAdd(_29.data_in_uvec4[gl_LocalInvocationID.x]); + _16.data_out_int = subgroupInclusiveAdd(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupInclusiveAdd(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupInclusiveAdd(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupInclusiveAdd(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupInclusiveAdd(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupInclusiveAdd(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupInclusiveAdd(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupInclusiveAdd(_29.data_in_uvec4[gl_LocalInvocationID.x]); +} + diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp b/reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp new file mode 100644 index 00000000..17e52c02 --- /dev/null +++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp @@ -0,0 +1,892 @@ +#version 450 + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#extension GL_KHR_shader_subgroup_basic : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5)) +#extension GL_AMD_gpu_shader_int64 : enable +#extension GL_NV_gpu_shader5 : enable +#extension GL_AMD_gcn_shader : require +#else +#error No extensions available to emulate requested subgroup feature. 
+#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#extension GL_KHR_shader_subgroup_ballot : require +#elif defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64) +#extension GL_ARB_shader_int64 : enable +#extension GL_ARB_shader_ballot : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_NV_shader_thread_group) +#extension GL_NV_shader_thread_group : require +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#extension GL_KHR_shader_subgroup_arithmetic : require +#elif defined(GL_NV_shader_thread_shuffle) +#extension GL_NV_shader_thread_shuffle : require +#else +#error No extensions available to emulate requested subgroup feature. +#endif +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 1, std430) buffer DATA_OUT +{ + int data_out_int; + ivec2 data_out_ivec2; + ivec3 data_out_ivec3; + ivec4 data_out_ivec4; + uint data_out_uint; + uvec2 data_out_uvec2; + uvec3 data_out_uvec3; + uvec4 data_out_uvec4; +} _16; + +layout(binding = 0, std430) buffer DATA_IN +{ + int data_in_int[128]; + ivec2 data_in_ivec2[128]; + ivec3 data_in_ivec3[128]; + ivec4 data_in_ivec4[128]; + uint data_in_uint[128]; + uvec2 data_in_uvec2[128]; + uvec3 data_in_uvec3[128]; + uvec4 data_in_uvec4[128]; +} _29; + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u) +#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u) +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u) +#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u) +#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u) +#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u) +#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u) +#endif + +#if defined(GL_KHR_shader_subgroup_basic) +#elif defined(GL_NV_shader_thread_group) +#define gl_SubgroupSize gl_WarpSizeNV +#elif defined(GL_ARB_shader_ballot) +#define gl_SubgroupSize gl_SubGroupSizeARB +#elif defined(GL_AMD_gcn_shader) +#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD) +#endif + +#if defined(GL_KHR_shader_subgroup_ballot) +#elif defined(GL_NV_shader_thread_group) +uvec4 subgroupBallot(bool v) { return uvec4(ballotThreadNV(v), 0u, 0u, 0u); } +#elif defined(GL_ARB_shader_ballot) +uvec4 subgroupBallot(bool v) { return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u); } +#endif + +#ifndef GL_KHR_shader_subgroup_basic +bool subgroupElect() +{ + uvec4 activeMask = subgroupBallot(true); + uint firstLive = 
subgroupBallotFindLSB(activeMask); + return gl_SubgroupInvocationID == firstLive; +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +uint subgroupBallotBitCount(uvec4 value) +{ + ivec2 c = bitCount(value.xy); +#ifdef GL_NV_shader_thread_group + return uint(c.x); +#else + return uint(c.x + c.y); +#endif +} +#endif + +#ifndef GL_KHR_shader_subgroup_ballot +bool subgroupBallotBitExtract(uvec4 value, uint index) +{ +#ifdef GL_NV_shader_thread_group + uint shifted = value.x >> index; +#else + uint shifted = value[index >> 5u] >> (index & 0x1fu); +#endif + return (shifted & 1u) != 0u; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +uint subgroupMul(uint v) +{ + uint reduction = 1u; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uint s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : 1u; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uint s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : 1u; + } + } + return reduction; +} +uvec2 subgroupMul(uvec2 v) +{ + uvec2 reduction = uvec2(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : uvec2(1u); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : uvec2(1u); + } + } + return reduction; +} +uvec3 subgroupMul(uvec3 v) +{ + uvec3 reduction = uvec3(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : uvec3(1u); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : uvec3(1u); + } + } + return reduction; +} +uvec4 subgroupMul(uvec4 v) +{ + uvec4 reduction = uvec4(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : uvec4(1u); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? 
s : uvec4(1u); + } + } + return reduction; +} +int subgroupMul(int v) +{ + int reduction = 1; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + int s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : 1; + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + int s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : 1; + } + } + return reduction; +} +ivec2 subgroupMul(ivec2 v) +{ + ivec2 reduction = ivec2(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : ivec2(1); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec2 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : ivec2(1); + } + } + return reduction; +} +ivec3 subgroupMul(ivec3 v) +{ + ivec3 reduction = ivec3(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : ivec3(1); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec3 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : ivec3(1); + } + } + return reduction; +} +ivec4 subgroupMul(ivec4 v) +{ + ivec4 reduction = ivec4(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + reduction = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); + reduction *= valid ? s : ivec4(1); + } + } + else + { + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec4 s = shuffleNV(v, i, gl_SubgroupSize); + reduction *= valid ? s : ivec4(1); + } + } + return reduction; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +uint subgroupExclusiveMul(uint v) +{ + uint excl_scan = 1u; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uint s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : 1u; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 1u; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uint s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? 
s : 1u; + } + } + return excl_scan; +} +uvec2 subgroupExclusiveMul(uvec2 v) +{ + uvec2 excl_scan = uvec2(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : uvec2(1u); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = uvec2(1u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : uvec2(1u); + } + } + return excl_scan; +} +uvec3 subgroupExclusiveMul(uvec3 v) +{ + uvec3 excl_scan = uvec3(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : uvec3(1u); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = uvec3(1u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : uvec3(1u); + } + } + return excl_scan; +} +uvec4 subgroupExclusiveMul(uvec4 v) +{ + uvec4 excl_scan = uvec4(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : uvec4(1u); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = uvec4(1u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : uvec4(1u); + } + } + return excl_scan; +} +int subgroupExclusiveMul(int v) +{ + int excl_scan = 1; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + int s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : 1; + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = 1; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + int s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? 
s : 1; + } + } + return excl_scan; +} +ivec2 subgroupExclusiveMul(ivec2 v) +{ + ivec2 excl_scan = ivec2(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : ivec2(1); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = ivec2(1); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : ivec2(1); + } + } + return excl_scan; +} +ivec3 subgroupExclusiveMul(ivec3 v) +{ + ivec3 excl_scan = ivec3(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : ivec3(1); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = ivec3(1); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : ivec3(1); + } + } + return excl_scan; +} +ivec4 subgroupExclusiveMul(ivec4 v) +{ + ivec4 excl_scan = ivec4(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + excl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); + excl_scan *= valid ? s : ivec4(1); + } + excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); + if (subgroupElect()) + { + excl_scan = ivec4(1); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLtMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + excl_scan *= valid ? s : ivec4(1); + } + } + return excl_scan; +} +#endif + +#if defined(GL_KHR_shader_subgroup_arithmetic) +#elif defined(GL_NV_shader_thread_shuffle) +uint subgroupInclusiveMul(uint v) +{ + uint incl_scan = 1u; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uint s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : 1u; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uint s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? 
s : 1u; + } + } + return incl_scan; +} +uvec2 subgroupInclusiveMul(uvec2 v) +{ + uvec2 incl_scan = uvec2(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : uvec2(1u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : uvec2(1u); + } + } + return incl_scan; +} +uvec3 subgroupInclusiveMul(uvec3 v) +{ + uvec3 incl_scan = uvec3(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : uvec3(1u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : uvec3(1u); + } + } + return incl_scan; +} +uvec4 subgroupInclusiveMul(uvec4 v) +{ + uvec4 incl_scan = uvec4(1u); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + uvec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : uvec4(1u); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + uvec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : uvec4(1u); + } + } + return incl_scan; +} +int subgroupInclusiveMul(int v) +{ + int incl_scan = 1; + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + int s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : 1; + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + int s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : 1; + } + } + return incl_scan; +} +ivec2 subgroupInclusiveMul(ivec2 v) +{ + ivec2 incl_scan = ivec2(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? 
s : ivec2(1); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec2 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : ivec2(1); + } + } + return incl_scan; +} +ivec3 subgroupInclusiveMul(ivec3 v) +{ + ivec3 incl_scan = ivec3(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : ivec3(1); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec3 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : ivec3(1); + } + } + return incl_scan; +} +ivec4 subgroupInclusiveMul(ivec4 v) +{ + ivec4 incl_scan = ivec4(1); + uvec4 active_threads = subgroupBallot(true); + if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) + { + uint total = gl_SubgroupSize / 2u; + incl_scan = v; + for (uint i = 1u; i <= total; i <<= 1u) + { + bool valid; + ivec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); + incl_scan *= valid ? s : ivec4(1); + } + } + else + { + uint total = subgroupBallotBitCount(gl_SubgroupLeMask); + for (uint i = 0u; i < gl_SubgroupSize; ++i) + { + bool valid = subgroupBallotBitExtract(active_threads, i); + ivec4 s = shuffleNV(v, i, gl_SubgroupSize); + valid = valid && (i < total); + incl_scan *= valid ? s : ivec4(1); + } + } + return incl_scan; +} +#endif + +void main() +{ + _16.data_out_int = subgroupMul(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupMul(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupMul(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupMul(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupMul(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupMul(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupMul(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupMul(_29.data_in_uvec4[gl_LocalInvocationID.x]); + _16.data_out_int = subgroupExclusiveMul(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupExclusiveMul(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupExclusiveMul(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupExclusiveMul(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = subgroupExclusiveMul(_29.data_in_uint[gl_LocalInvocationID.x]); + _16.data_out_uvec2 = subgroupExclusiveMul(_29.data_in_uvec2[gl_LocalInvocationID.x]); + _16.data_out_uvec3 = subgroupExclusiveMul(_29.data_in_uvec3[gl_LocalInvocationID.x]); + _16.data_out_uvec4 = subgroupExclusiveMul(_29.data_in_uvec4[gl_LocalInvocationID.x]); + _16.data_out_int = subgroupInclusiveMul(_29.data_in_int[gl_LocalInvocationID.x]); + _16.data_out_ivec2 = subgroupInclusiveMul(_29.data_in_ivec2[gl_LocalInvocationID.x]); + _16.data_out_ivec3 = subgroupInclusiveMul(_29.data_in_ivec3[gl_LocalInvocationID.x]); + _16.data_out_ivec4 = subgroupInclusiveMul(_29.data_in_ivec4[gl_LocalInvocationID.x]); + _16.data_out_uint = 
subgroupInclusiveMul(_29.data_in_uint[gl_LocalInvocationID.x]);
+    _16.data_out_uvec2 = subgroupInclusiveMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
+    _16.data_out_uvec3 = subgroupInclusiveMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
+    _16.data_out_uvec4 = subgroupInclusiveMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
+}
+
diff --git a/reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp.vk b/reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp.vk
new file mode 100644
index 00000000..706684dd
--- /dev/null
+++ b/reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp.vk
@@ -0,0 +1,56 @@
+#version 450
+#extension GL_KHR_shader_subgroup_arithmetic : require
+layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 0, binding = 1, std430) buffer DATA_OUT
+{
+    int data_out_int;
+    ivec2 data_out_ivec2;
+    ivec3 data_out_ivec3;
+    ivec4 data_out_ivec4;
+    uint data_out_uint;
+    uvec2 data_out_uvec2;
+    uvec3 data_out_uvec3;
+    uvec4 data_out_uvec4;
+} _16;
+
+layout(set = 0, binding = 0, std430) buffer DATA_IN
+{
+    int data_in_int[128];
+    ivec2 data_in_ivec2[128];
+    ivec3 data_in_ivec3[128];
+    ivec4 data_in_ivec4[128];
+    uint data_in_uint[128];
+    uvec2 data_in_uvec2[128];
+    uvec3 data_in_uvec3[128];
+    uvec4 data_in_uvec4[128];
+} _29;
+
+void main()
+{
+    _16.data_out_int = subgroupMul(_29.data_in_int[gl_LocalInvocationID.x]);
+    _16.data_out_ivec2 = subgroupMul(_29.data_in_ivec2[gl_LocalInvocationID.x]);
+    _16.data_out_ivec3 = subgroupMul(_29.data_in_ivec3[gl_LocalInvocationID.x]);
+    _16.data_out_ivec4 = subgroupMul(_29.data_in_ivec4[gl_LocalInvocationID.x]);
+    _16.data_out_uint = subgroupMul(_29.data_in_uint[gl_LocalInvocationID.x]);
+    _16.data_out_uvec2 = subgroupMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
+    _16.data_out_uvec3 = subgroupMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
+    _16.data_out_uvec4 = subgroupMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
+    _16.data_out_int = subgroupExclusiveMul(_29.data_in_int[gl_LocalInvocationID.x]);
+    _16.data_out_ivec2 = subgroupExclusiveMul(_29.data_in_ivec2[gl_LocalInvocationID.x]);
+    _16.data_out_ivec3 = subgroupExclusiveMul(_29.data_in_ivec3[gl_LocalInvocationID.x]);
+    _16.data_out_ivec4 = subgroupExclusiveMul(_29.data_in_ivec4[gl_LocalInvocationID.x]);
+    _16.data_out_uint = subgroupExclusiveMul(_29.data_in_uint[gl_LocalInvocationID.x]);
+    _16.data_out_uvec2 = subgroupExclusiveMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
+    _16.data_out_uvec3 = subgroupExclusiveMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
+    _16.data_out_uvec4 = subgroupExclusiveMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
+    _16.data_out_int = subgroupInclusiveMul(_29.data_in_int[gl_LocalInvocationID.x]);
+    _16.data_out_ivec2 = subgroupInclusiveMul(_29.data_in_ivec2[gl_LocalInvocationID.x]);
+    _16.data_out_ivec3 = subgroupInclusiveMul(_29.data_in_ivec3[gl_LocalInvocationID.x]);
+    _16.data_out_ivec4 = subgroupInclusiveMul(_29.data_in_ivec4[gl_LocalInvocationID.x]);
+    _16.data_out_uint = subgroupInclusiveMul(_29.data_in_uint[gl_LocalInvocationID.x]);
+    _16.data_out_uvec2 = subgroupInclusiveMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
+    _16.data_out_uvec3 = subgroupInclusiveMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
+    _16.data_out_uvec4 = subgroupInclusiveMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
+}
+
diff --git a/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp b/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp
new file mode 100644
index 00000000..421c7670
--- /dev/null
+++ b/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp
@@ -0,0 +1,61 @@
+#version 450
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x = 128) in;
+
+layout(std430, binding = 0) buffer DATA_IN
+{
+    float data_in_float[128];
+    vec2 data_in_vec2[128];
+    vec3 data_in_vec3[128];
+    vec4 data_in_vec4[128];
+    double data_in_double[128];
+    dvec2 data_in_dvec2[128];
+    dvec3 data_in_dvec3[128];
+    dvec4 data_in_dvec4[128];
+};
+
+layout(std430, binding = 1) buffer DATA_OUT
+{
+    float data_out_float;
+    vec2 data_out_vec2;
+    vec3 data_out_vec3;
+    vec4 data_out_vec4;
+    double data_out_double;
+    dvec2 data_out_dvec2;
+    dvec3 data_out_dvec3;
+    dvec4 data_out_dvec4;
+};
+
+void main()
+{
+    data_out_float = subgroupAdd(data_in_float[gl_LocalInvocationID.x]);
+    data_out_vec2 = subgroupAdd(data_in_vec2[gl_LocalInvocationID.x]);
+    data_out_vec3 = subgroupAdd(data_in_vec3[gl_LocalInvocationID.x]);
+    data_out_vec4 = subgroupAdd(data_in_vec4[gl_LocalInvocationID.x]);
+
+    data_out_double = subgroupAdd(data_in_double[gl_LocalInvocationID.x]);
+    data_out_dvec2 = subgroupAdd(data_in_dvec2[gl_LocalInvocationID.x]);
+    data_out_dvec3 = subgroupAdd(data_in_dvec3[gl_LocalInvocationID.x]);
+    data_out_dvec4 = subgroupAdd(data_in_dvec4[gl_LocalInvocationID.x]);
+
+    data_out_float = subgroupExclusiveAdd(data_in_float[gl_LocalInvocationID.x]);
+    data_out_vec2 = subgroupExclusiveAdd(data_in_vec2[gl_LocalInvocationID.x]);
+    data_out_vec3 = subgroupExclusiveAdd(data_in_vec3[gl_LocalInvocationID.x]);
+    data_out_vec4 = subgroupExclusiveAdd(data_in_vec4[gl_LocalInvocationID.x]);
+
+    data_out_double = subgroupExclusiveAdd(data_in_double[gl_LocalInvocationID.x]);
+    data_out_dvec2 = subgroupExclusiveAdd(data_in_dvec2[gl_LocalInvocationID.x]);
+    data_out_dvec3 = subgroupExclusiveAdd(data_in_dvec3[gl_LocalInvocationID.x]);
+    data_out_dvec4 = subgroupExclusiveAdd(data_in_dvec4[gl_LocalInvocationID.x]);
+
+    data_out_float = subgroupInclusiveAdd(data_in_float[gl_LocalInvocationID.x]);
+    data_out_vec2 = subgroupInclusiveAdd(data_in_vec2[gl_LocalInvocationID.x]);
+    data_out_vec3 = subgroupInclusiveAdd(data_in_vec3[gl_LocalInvocationID.x]);
+    data_out_vec4 = subgroupInclusiveAdd(data_in_vec4[gl_LocalInvocationID.x]);
+
+    data_out_double = subgroupInclusiveAdd(data_in_double[gl_LocalInvocationID.x]);
+    data_out_dvec2 = subgroupInclusiveAdd(data_in_dvec2[gl_LocalInvocationID.x]);
+    data_out_dvec3 = subgroupInclusiveAdd(data_in_dvec3[gl_LocalInvocationID.x]);
+    data_out_dvec4 = subgroupInclusiveAdd(data_in_dvec4[gl_LocalInvocationID.x]);
+}
diff --git a/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp b/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp
new file mode 100644
index 00000000..d1d93174
--- /dev/null
+++ b/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp
@@ -0,0 +1,61 @@
+#version 450
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x = 128) in;
+
+layout(std430, binding = 0) buffer DATA_IN
+{
+    float data_in_float[128];
+    vec2 data_in_vec2[128];
+    vec3 data_in_vec3[128];
+    vec4 data_in_vec4[128];
+    double data_in_double[128];
+    dvec2 data_in_dvec2[128];
+    dvec3 data_in_dvec3[128];
+    dvec4 data_in_dvec4[128];
+};
+
+layout(std430, binding = 1) buffer DATA_OUT
+{
+    float data_out_float;
+    vec2 data_out_vec2;
+    vec3 data_out_vec3;
+    vec4 data_out_vec4;
+    double data_out_double;
+    dvec2 data_out_dvec2;
+    dvec3 data_out_dvec3;
+    dvec4 data_out_dvec4;
+};
+
+void main()
+{
+    data_out_float = subgroupMul(data_in_float[gl_LocalInvocationID.x]);
+    data_out_vec2 = subgroupMul(data_in_vec2[gl_LocalInvocationID.x]);
+    data_out_vec3 = subgroupMul(data_in_vec3[gl_LocalInvocationID.x]);
+    data_out_vec4 = subgroupMul(data_in_vec4[gl_LocalInvocationID.x]);
+
+    data_out_double = subgroupMul(data_in_double[gl_LocalInvocationID.x]);
+    data_out_dvec2 = subgroupMul(data_in_dvec2[gl_LocalInvocationID.x]);
+    data_out_dvec3 = subgroupMul(data_in_dvec3[gl_LocalInvocationID.x]);
+    data_out_dvec4 = subgroupMul(data_in_dvec4[gl_LocalInvocationID.x]);
+
+    data_out_float = subgroupExclusiveMul(data_in_float[gl_LocalInvocationID.x]);
+    data_out_vec2 = subgroupExclusiveMul(data_in_vec2[gl_LocalInvocationID.x]);
+    data_out_vec3 = subgroupExclusiveMul(data_in_vec3[gl_LocalInvocationID.x]);
+    data_out_vec4 = subgroupExclusiveMul(data_in_vec4[gl_LocalInvocationID.x]);
+
+    data_out_double = subgroupExclusiveMul(data_in_double[gl_LocalInvocationID.x]);
+    data_out_dvec2 = subgroupExclusiveMul(data_in_dvec2[gl_LocalInvocationID.x]);
+    data_out_dvec3 = subgroupExclusiveMul(data_in_dvec3[gl_LocalInvocationID.x]);
+    data_out_dvec4 = subgroupExclusiveMul(data_in_dvec4[gl_LocalInvocationID.x]);
+
+    data_out_float = subgroupInclusiveMul(data_in_float[gl_LocalInvocationID.x]);
+    data_out_vec2 = subgroupInclusiveMul(data_in_vec2[gl_LocalInvocationID.x]);
+    data_out_vec3 = subgroupInclusiveMul(data_in_vec3[gl_LocalInvocationID.x]);
+    data_out_vec4 = subgroupInclusiveMul(data_in_vec4[gl_LocalInvocationID.x]);
+
+    data_out_double = subgroupInclusiveMul(data_in_double[gl_LocalInvocationID.x]);
+    data_out_dvec2 = subgroupInclusiveMul(data_in_dvec2[gl_LocalInvocationID.x]);
+    data_out_dvec3 = subgroupInclusiveMul(data_in_dvec3[gl_LocalInvocationID.x]);
+    data_out_dvec4 = subgroupInclusiveMul(data_in_dvec4[gl_LocalInvocationID.x]);
+}
diff --git a/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp b/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp
new file mode 100644
index 00000000..eb62c66e
--- /dev/null
+++ b/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp
@@ -0,0 +1,61 @@
+#version 450
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x = 128) in;
+
+layout(std430, binding = 0) buffer DATA_IN
+{
+    int data_in_int[128];
+    ivec2 data_in_ivec2[128];
+    ivec3 data_in_ivec3[128];
+    ivec4 data_in_ivec4[128];
+    uint data_in_uint[128];
+    uvec2 data_in_uvec2[128];
+    uvec3 data_in_uvec3[128];
+    uvec4 data_in_uvec4[128];
+};
+
+layout(std430, binding = 1) buffer DATA_OUT
+{
+    int data_out_int;
+    ivec2 data_out_ivec2;
+    ivec3 data_out_ivec3;
+    ivec4 data_out_ivec4;
+    uint data_out_uint;
+    uvec2 data_out_uvec2;
+    uvec3 data_out_uvec3;
+    uvec4 data_out_uvec4;
+};
+
+void main()
+{
+    data_out_int = subgroupAdd(data_in_int[gl_LocalInvocationID.x]);
+    data_out_ivec2 = subgroupAdd(data_in_ivec2[gl_LocalInvocationID.x]);
+    data_out_ivec3 = subgroupAdd(data_in_ivec3[gl_LocalInvocationID.x]);
+    data_out_ivec4 = subgroupAdd(data_in_ivec4[gl_LocalInvocationID.x]);
+
+    data_out_uint = subgroupAdd(data_in_uint[gl_LocalInvocationID.x]);
+    data_out_uvec2 = subgroupAdd(data_in_uvec2[gl_LocalInvocationID.x]);
+    data_out_uvec3 = subgroupAdd(data_in_uvec3[gl_LocalInvocationID.x]);
+    data_out_uvec4 = subgroupAdd(data_in_uvec4[gl_LocalInvocationID.x]);
+
+    data_out_int = subgroupExclusiveAdd(data_in_int[gl_LocalInvocationID.x]);
+    data_out_ivec2 = subgroupExclusiveAdd(data_in_ivec2[gl_LocalInvocationID.x]);
+    data_out_ivec3 = subgroupExclusiveAdd(data_in_ivec3[gl_LocalInvocationID.x]);
+    data_out_ivec4 = subgroupExclusiveAdd(data_in_ivec4[gl_LocalInvocationID.x]);
+
+    data_out_uint = subgroupExclusiveAdd(data_in_uint[gl_LocalInvocationID.x]);
+    data_out_uvec2 = subgroupExclusiveAdd(data_in_uvec2[gl_LocalInvocationID.x]);
+    data_out_uvec3 = subgroupExclusiveAdd(data_in_uvec3[gl_LocalInvocationID.x]);
+    data_out_uvec4 = subgroupExclusiveAdd(data_in_uvec4[gl_LocalInvocationID.x]);
+
+    data_out_int = subgroupInclusiveAdd(data_in_int[gl_LocalInvocationID.x]);
+    data_out_ivec2 = subgroupInclusiveAdd(data_in_ivec2[gl_LocalInvocationID.x]);
+    data_out_ivec3 = subgroupInclusiveAdd(data_in_ivec3[gl_LocalInvocationID.x]);
+    data_out_ivec4 = subgroupInclusiveAdd(data_in_ivec4[gl_LocalInvocationID.x]);
+
+    data_out_uint = subgroupInclusiveAdd(data_in_uint[gl_LocalInvocationID.x]);
+    data_out_uvec2 = subgroupInclusiveAdd(data_in_uvec2[gl_LocalInvocationID.x]);
+    data_out_uvec3 = subgroupInclusiveAdd(data_in_uvec3[gl_LocalInvocationID.x]);
+    data_out_uvec4 = subgroupInclusiveAdd(data_in_uvec4[gl_LocalInvocationID.x]);
+}
diff --git a/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp b/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp
new file mode 100644
index 00000000..b2cd0bcf
--- /dev/null
+++ b/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp
@@ -0,0 +1,61 @@
+#version 450
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout(local_size_x = 128) in;
+
+layout(std430, binding = 0) buffer DATA_IN
+{
+    int data_in_int[128];
+    ivec2 data_in_ivec2[128];
+    ivec3 data_in_ivec3[128];
+    ivec4 data_in_ivec4[128];
+    uint data_in_uint[128];
+    uvec2 data_in_uvec2[128];
+    uvec3 data_in_uvec3[128];
+    uvec4 data_in_uvec4[128];
+};
+
+layout(std430, binding = 1) buffer DATA_OUT
+{
+    int data_out_int;
+    ivec2 data_out_ivec2;
+    ivec3 data_out_ivec3;
+    ivec4 data_out_ivec4;
+    uint data_out_uint;
+    uvec2 data_out_uvec2;
+    uvec3 data_out_uvec3;
+    uvec4 data_out_uvec4;
+};
+
+void main()
+{
+    data_out_int = subgroupMul(data_in_int[gl_LocalInvocationID.x]);
+    data_out_ivec2 = subgroupMul(data_in_ivec2[gl_LocalInvocationID.x]);
+    data_out_ivec3 = subgroupMul(data_in_ivec3[gl_LocalInvocationID.x]);
+    data_out_ivec4 = subgroupMul(data_in_ivec4[gl_LocalInvocationID.x]);
+
+    data_out_uint = subgroupMul(data_in_uint[gl_LocalInvocationID.x]);
+    data_out_uvec2 = subgroupMul(data_in_uvec2[gl_LocalInvocationID.x]);
+    data_out_uvec3 = subgroupMul(data_in_uvec3[gl_LocalInvocationID.x]);
+    data_out_uvec4 = subgroupMul(data_in_uvec4[gl_LocalInvocationID.x]);
+
+    data_out_int = subgroupExclusiveMul(data_in_int[gl_LocalInvocationID.x]);
+    data_out_ivec2 = subgroupExclusiveMul(data_in_ivec2[gl_LocalInvocationID.x]);
+    data_out_ivec3 = subgroupExclusiveMul(data_in_ivec3[gl_LocalInvocationID.x]);
+    data_out_ivec4 = subgroupExclusiveMul(data_in_ivec4[gl_LocalInvocationID.x]);
+
+    data_out_uint = subgroupExclusiveMul(data_in_uint[gl_LocalInvocationID.x]);
+    data_out_uvec2 = subgroupExclusiveMul(data_in_uvec2[gl_LocalInvocationID.x]);
+    data_out_uvec3 = subgroupExclusiveMul(data_in_uvec3[gl_LocalInvocationID.x]);
+    data_out_uvec4 = subgroupExclusiveMul(data_in_uvec4[gl_LocalInvocationID.x]);
+
+    data_out_int = subgroupInclusiveMul(data_in_int[gl_LocalInvocationID.x]);
+    data_out_ivec2 = subgroupInclusiveMul(data_in_ivec2[gl_LocalInvocationID.x]);
+    data_out_ivec3 = subgroupInclusiveMul(data_in_ivec3[gl_LocalInvocationID.x]);
+    data_out_ivec4 = subgroupInclusiveMul(data_in_ivec4[gl_LocalInvocationID.x]);
+
+    data_out_uint = subgroupInclusiveMul(data_in_uint[gl_LocalInvocationID.x]);
+    data_out_uvec2 = subgroupInclusiveMul(data_in_uvec2[gl_LocalInvocationID.x]);
+    data_out_uvec3 = subgroupInclusiveMul(data_in_uvec3[gl_LocalInvocationID.x]);
+    data_out_uvec4 = subgroupInclusiveMul(data_in_uvec4[gl_LocalInvocationID.x]);
+}
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index f4ceab6a..f2fb8781 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 
 #ifndef _WIN32
 #include
@@ -3993,6 +3994,169 @@ void CompilerGLSL::emit_output_variable_initializer(const SPIRVariable &var)
 	}
 }
 
+void CompilerGLSL::emit_subgroup_arithmetic_workaround(const std::string &func, Op op, GroupOperation group_op)
+{
+    std::string result;
+    switch (group_op)
+    {
+    case GroupOperationReduce:
+        result = "reduction";
+        break;
+
+    case GroupOperationExclusiveScan:
+        result = "excl_scan";
+        break;
+
+    case GroupOperationInclusiveScan:
+        result = "incl_scan";
+        break;
+
+    default:
+        SPIRV_CROSS_THROW("Unsupported workaround for arithmetic group operation");
+    }
+
+    struct TypeInfo
+    {
+        std::string type;
+        std::string identity;
+    };
+
+    std::vector<TypeInfo> type_infos;
+    switch (op)
+    {
+    case OpGroupNonUniformIAdd:
+    {
+        type_infos.emplace_back(TypeInfo{ "uint", "0u" });
+        type_infos.emplace_back(TypeInfo{ "uvec2", "uvec2(0u)" });
+        type_infos.emplace_back(TypeInfo{ "uvec3", "uvec3(0u)" });
+        type_infos.emplace_back(TypeInfo{ "uvec4", "uvec4(0u)" });
+        type_infos.emplace_back(TypeInfo{ "int", "0" });
+        type_infos.emplace_back(TypeInfo{ "ivec2", "ivec2(0)" });
+        type_infos.emplace_back(TypeInfo{ "ivec3", "ivec3(0)" });
+        type_infos.emplace_back(TypeInfo{ "ivec4", "ivec4(0)" });
+        break;
+    }
+
+    case OpGroupNonUniformFAdd:
+    {
+        type_infos.emplace_back(TypeInfo{ "float", "0.0f" });
+        type_infos.emplace_back(TypeInfo{ "vec2", "vec2(0.0f)" });
+        type_infos.emplace_back(TypeInfo{ "vec3", "vec3(0.0f)" });
+        type_infos.emplace_back(TypeInfo{ "vec4", "vec4(0.0f)" });
+        // ARB_gpu_shader_fp64 is required in GL4.0 which in turn is required by NV_thread_shuffle
+        type_infos.emplace_back(TypeInfo{ "double", "0.0LF" });
+        type_infos.emplace_back(TypeInfo{ "dvec2", "dvec2(0.0LF)" });
+        type_infos.emplace_back(TypeInfo{ "dvec3", "dvec3(0.0LF)" });
+        type_infos.emplace_back(TypeInfo{ "dvec4", "dvec4(0.0LF)" });
+        break;
+    }
+
+    case OpGroupNonUniformIMul:
+    {
+        type_infos.emplace_back(TypeInfo{ "uint", "1u" });
+        type_infos.emplace_back(TypeInfo{ "uvec2", "uvec2(1u)" });
+        type_infos.emplace_back(TypeInfo{ "uvec3", "uvec3(1u)" });
+        type_infos.emplace_back(TypeInfo{ "uvec4", "uvec4(1u)" });
+        type_infos.emplace_back(TypeInfo{ "int", "1" });
+        type_infos.emplace_back(TypeInfo{ "ivec2", "ivec2(1)" });
+        type_infos.emplace_back(TypeInfo{ "ivec3", "ivec3(1)" });
+        type_infos.emplace_back(TypeInfo{ "ivec4", "ivec4(1)" });
+        break;
+    }
+
+    case OpGroupNonUniformFMul:
+    {
+        type_infos.emplace_back(TypeInfo{ "float", "1.0f" });
+        type_infos.emplace_back(TypeInfo{ "vec2", "vec2(1.0f)" });
+        type_infos.emplace_back(TypeInfo{ "vec3", "vec3(1.0f)" });
+        type_infos.emplace_back(TypeInfo{ "vec4", "vec4(1.0f)" });
+        type_infos.emplace_back(TypeInfo{ "double", "1.0LF" });
+        type_infos.emplace_back(TypeInfo{ "dvec2", "dvec2(1.0LF)" });
+        type_infos.emplace_back(TypeInfo{ "dvec3", "dvec3(1.0LF)" });
+        type_infos.emplace_back(TypeInfo{ "dvec4", "dvec4(1.0LF)" });
+        break;
+    }
+
+    default:
+        SPIRV_CROSS_THROW("Unsupported workaround for arithmetic group operation");
+    }
+
+    const bool op_is_addition = op == OpGroupNonUniformIAdd || op == OpGroupNonUniformFAdd;
+    const bool op_is_multiplication = op == OpGroupNonUniformIMul || op == OpGroupNonUniformFMul;
+    std::string op_symbol;
+    if (op_is_addition)
+    {
+        op_symbol = "+=";
+    }
+    else if (op_is_multiplication)
+    {
+        op_symbol = "*=";
+    }
+
+    for (const TypeInfo &t : type_infos)
+    {
+        statement(t.type, " ", func, "(", t.type, " v)");
+        begin_scope();
+        statement(t.type, " ", result, " = ", t.identity, ";");
+        statement("uvec4 active_threads = subgroupBallot(true);");
+        statement("if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)");
+        begin_scope();
+        statement("uint total = gl_SubgroupSize / 2u;");
+        statement(result, " = v;");
+        statement("for (uint i = 1u; i <= total; i <<= 1u)");
+        begin_scope();
+        statement("bool valid;");
+        if (group_op == GroupOperationReduce)
+        {
+            statement(t.type, " s = shuffleXorNV(", result, ", i, gl_SubgroupSize, valid);");
+        }
+        else if (group_op == GroupOperationExclusiveScan || group_op == GroupOperationInclusiveScan)
+        {
+            statement(t.type, " s = shuffleUpNV(", result, ", i, gl_SubgroupSize, valid);");
+        }
+        if (op_is_addition || op_is_multiplication)
+        {
+            statement(result, " ", op_symbol, " valid ? s : ", t.identity, ";");
+        }
+        end_scope();
+        if (group_op == GroupOperationExclusiveScan)
+        {
+            statement(result, " = shuffleUpNV(", result, ", 1u, gl_SubgroupSize);");
+            statement("if (subgroupElect())");
+            begin_scope();
+            statement(result, " = ", t.identity, ";");
+            end_scope();
+        }
+        end_scope();
+        statement("else");
+        begin_scope();
+        if (group_op == GroupOperationExclusiveScan)
+        {
+            statement("uint total = subgroupBallotBitCount(gl_SubgroupLtMask);");
+        }
+        else if (group_op == GroupOperationInclusiveScan)
+        {
+            statement("uint total = subgroupBallotBitCount(gl_SubgroupLeMask);");
+        }
+        statement("for (uint i = 0u; i < gl_SubgroupSize; ++i)");
+        begin_scope();
+        statement("bool valid = subgroupBallotBitExtract(active_threads, i);");
+        statement(t.type, " s = shuffleNV(v, i, gl_SubgroupSize);");
+        if (group_op == GroupOperationExclusiveScan || group_op == GroupOperationInclusiveScan)
+        {
+            statement("valid = valid && (i < total);");
+        }
+        if (op_is_addition || op_is_multiplication)
+        {
+            statement(result, " ", op_symbol, " valid ? s : ", t.identity, ";");
+        }
+        end_scope();
+        end_scope();
+        statement("return ", result, ";");
+        end_scope();
+    }
+}
+
 void CompilerGLSL::emit_extension_workarounds(spv::ExecutionModel model)
 {
 	static const char *workaround_types[] = { "int", "ivec2", "ivec3", "ivec4", "uint", "uvec2", "uvec3", "uvec4",
@@ -4396,6 +4560,57 @@ void CompilerGLSL::emit_extension_workarounds(spv::ExecutionModel model)
 		statement("#endif");
 		statement("");
 	}
+
+    auto arithmetic_feature_helper =
+        [&](Supp::Feature feat, std::string func_name, spv::Op op, spv::GroupOperation group_op)
+    {
+        if (shader_subgroup_supporter.is_feature_requested(feat))
+        {
+            auto exts = Supp::get_candidates_for_feature(feat, result);
+            for (auto &e : exts)
+            {
+                const char *name = Supp::get_extension_name(e);
+                statement(&e == &exts.front() ? "#if" : "#elif", " defined(", name, ")");
+
+                switch (e)
+                {
+                case Supp::NV_shader_thread_shuffle:
+                    emit_subgroup_arithmetic_workaround(func_name, op, group_op);
+                    break;
+                default:
+                    break;
+                }
+            }
+            statement("#endif");
+            statement("");
+        }
+    };
+
+    arithmetic_feature_helper(Supp::SubgroupArithmeticIAddReduce, "subgroupAdd", OpGroupNonUniformIAdd,
+                              GroupOperationReduce);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticIAddExclusiveScan, "subgroupExclusiveAdd",
+                              OpGroupNonUniformIAdd, GroupOperationExclusiveScan);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticIAddInclusiveScan, "subgroupInclusiveAdd",
+                              OpGroupNonUniformIAdd, GroupOperationInclusiveScan);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticFAddReduce, "subgroupAdd", OpGroupNonUniformFAdd,
+                              GroupOperationReduce);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticFAddExclusiveScan, "subgroupExclusiveAdd",
+                              OpGroupNonUniformFAdd, GroupOperationExclusiveScan);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticFAddInclusiveScan, "subgroupInclusiveAdd",
+                              OpGroupNonUniformFAdd, GroupOperationInclusiveScan);
+
+    arithmetic_feature_helper(Supp::SubgroupArithmeticIMulReduce, "subgroupMul", OpGroupNonUniformIMul,
+                              GroupOperationReduce);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticIMulExclusiveScan, "subgroupExclusiveMul",
+                              OpGroupNonUniformIMul, GroupOperationExclusiveScan);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticIMulInclusiveScan, "subgroupInclusiveMul",
+                              OpGroupNonUniformIMul, GroupOperationInclusiveScan);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticFMulReduce, "subgroupMul", OpGroupNonUniformFMul,
+                              GroupOperationReduce);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticFMulExclusiveScan, "subgroupExclusiveMul",
+                              OpGroupNonUniformFMul, GroupOperationExclusiveScan);
+    arithmetic_feature_helper(Supp::SubgroupArithmeticFMulInclusiveScan, "subgroupInclusiveMul",
+                              OpGroupNonUniformFMul, GroupOperationInclusiveScan);
 	}
 
 	if (!workaround_ubo_load_overload_types.empty())
@@ -7109,7 +7324,7 @@ string CompilerGLSL::to_combined_image_sampler(VariableID image_id, VariableID s
 	}
 }
 
-bool CompilerGLSL::is_supported_subgroup_op_in_opengl(spv::Op op)
+bool CompilerGLSL::is_supported_subgroup_op_in_opengl(spv::Op op, const uint32_t *ops)
 {
 	switch (op)
 	{
@@ -7128,6 +7343,22 @@ bool CompilerGLSL::is_supported_subgroup_op_in_opengl(spv::Op op)
 	case OpGroupNonUniformBallotBitExtract:
 	case OpGroupNonUniformInverseBallot:
 		return true;
+	case OpGroupNonUniformIAdd:
+	case OpGroupNonUniformFAdd:
+	case OpGroupNonUniformIMul:
+	case OpGroupNonUniformFMul:
+	{
+		const GroupOperation operation = static_cast<GroupOperation>(ops[3]);
+		if (operation == GroupOperationReduce || operation == GroupOperationInclusiveScan ||
+		    operation == GroupOperationExclusiveScan)
+		{
+			return true;
+		}
+		else
+		{
+			return false;
+		}
+	}
 	default:
 		return false;
 	}
@@ -8725,7 +8956,7 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i)
 	const uint32_t *ops = stream(i);
 	auto op = static_cast<Op>(i.op);
 
-	if (!options.vulkan_semantics && !is_supported_subgroup_op_in_opengl(op))
+	if (!options.vulkan_semantics && !is_supported_subgroup_op_in_opengl(op, ops))
 		SPIRV_CROSS_THROW("This subgroup operation is only supported in Vulkan semantics.");
 
 	// If we need to do implicit bitcasts, make sure we do it with the correct type.
@@ -8793,12 +9024,34 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i)
 	}
 	break;
 
-	case OpGroupNonUniformFAdd:
-	case OpGroupNonUniformFMul:
+	// clang-format off
+#define GLSL_GROUP_OP(OP)\
+	case OpGroupNonUniform##OP:\
+	{\
+		auto operation = static_cast<GroupOperation>(ops[3]);\
+		if (operation == GroupOperationClusteredReduce)\
+			require_extension_internal("GL_KHR_shader_subgroup_clustered");\
+		else if (operation == GroupOperationReduce)\
+			request_subgroup_feature(ShaderSubgroupSupportHelper::SubgroupArithmetic##OP##Reduce);\
+		else if (operation == GroupOperationExclusiveScan)\
+			request_subgroup_feature(ShaderSubgroupSupportHelper::SubgroupArithmetic##OP##ExclusiveScan);\
+		else if (operation == GroupOperationInclusiveScan)\
+			request_subgroup_feature(ShaderSubgroupSupportHelper::SubgroupArithmetic##OP##InclusiveScan);\
+		else\
+			SPIRV_CROSS_THROW("Invalid group operation.");\
+		break;\
+	}
+
+	GLSL_GROUP_OP(IAdd)
+	GLSL_GROUP_OP(FAdd)
+	GLSL_GROUP_OP(IMul)
+	GLSL_GROUP_OP(FMul)
+
+#undef GLSL_GROUP_OP
+	// clang-format on
+
 	case OpGroupNonUniformFMin:
 	case OpGroupNonUniformFMax:
-	case OpGroupNonUniformIAdd:
-	case OpGroupNonUniformIMul:
 	case OpGroupNonUniformSMin:
 	case OpGroupNonUniformSMax:
 	case OpGroupNonUniformUMin:
@@ -17667,6 +17920,7 @@ const char *CompilerGLSL::ShaderSubgroupSupportHelper::get_extension_name(Candid
 	static const char *const retval[CandidateCount] = {
 		"GL_KHR_shader_subgroup_ballot", "GL_KHR_shader_subgroup_basic", "GL_KHR_shader_subgroup_vote",
+		"GL_KHR_shader_subgroup_arithmetic",
 		"GL_NV_gpu_shader_5", "GL_NV_shader_thread_group", "GL_NV_shader_thread_shuffle",
@@ -17715,6 +17969,21 @@ CompilerGLSL::ShaderSubgroupSupportHelper::FeatureVector CompilerGLSL::ShaderSub
 		return { SubgroupMask };
 	case SubgroupBallotBitCount:
 		return { SubgroupBallot };
+	case SubgroupArithmeticIAddReduce:
+	case SubgroupArithmeticIAddInclusiveScan:
+	case SubgroupArithmeticFAddReduce:
+	case SubgroupArithmeticFAddInclusiveScan:
+	case SubgroupArithmeticIMulReduce:
+	case SubgroupArithmeticIMulInclusiveScan:
+	case SubgroupArithmeticFMulReduce:
+	case SubgroupArithmeticFMulInclusiveScan:
+		return { SubgroupSize, SubgroupBallot, SubgroupBallotBitCount, SubgroupMask, SubgroupBallotBitExtract };
+	case SubgroupArithmeticIAddExclusiveScan:
+	case SubgroupArithmeticFAddExclusiveScan:
+	case SubgroupArithmeticIMulExclusiveScan:
+	case SubgroupArithmeticFMulExclusiveScan:
+		return { SubgroupSize, SubgroupBallot, SubgroupBallotBitCount,
+		         SubgroupMask, SubgroupElect, SubgroupBallotBitExtract };
 	default:
 		return {};
 	}
@@ -17728,11 +17997,15 @@ CompilerGLSL::ShaderSubgroupSupportHelper::FeatureMask CompilerGLSL::ShaderSubgr
 bool CompilerGLSL::ShaderSubgroupSupportHelper::can_feature_be_implemented_without_extensions(Feature feature)
 {
-	static const bool retval[FeatureCount] = { false, false, false, false, false, false,
-		                                       true, // SubgroupBalloFindLSB_MSB
-		                                       false, false, false, false,
-		                                       true, // SubgroupMemBarrier - replaced with workgroup memory barriers
-		                                       false, false, true, false };
+	static const bool retval[FeatureCount] = {
+		false, false, false, false, false, false,
+		true, // SubgroupBalloFindLSB_MSB
+		false, false, false, false,
+		true, // SubgroupMemBarrier - replaced with workgroup memory barriers
+		false, false, true, false,
+		false, false, false, false, false, false, // iadd, fadd
+		false, false, false, false, false, false, // imul , fmul
+	};
 
 	return retval[feature];
 }
@@ -17744,7 +18017,11 @@ CompilerGLSL::ShaderSubgroupSupportHelper::Candidate CompilerGLSL::ShaderSubgrou
 		KHR_shader_subgroup_ballot, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic,
 		KHR_shader_subgroup_basic, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_vote,
 		KHR_shader_subgroup_vote, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic,
-		KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot
+		KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot,
+		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
+		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
+		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
+		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
 	};
 
 	return extensions[feature];
@@ -17840,6 +18117,19 @@ CompilerGLSL::ShaderSubgroupSupportHelper::CandidateVector CompilerGLSL::ShaderS
 		return { NV_shader_thread_group };
 	case SubgroupBallotBitCount:
 		return {};
+	case SubgroupArithmeticIAddReduce:
+	case SubgroupArithmeticIAddExclusiveScan:
+	case SubgroupArithmeticIAddInclusiveScan:
+	case SubgroupArithmeticFAddReduce:
+	case SubgroupArithmeticFAddExclusiveScan:
+	case SubgroupArithmeticFAddInclusiveScan:
+	case SubgroupArithmeticIMulReduce:
+	case SubgroupArithmeticIMulExclusiveScan:
+	case SubgroupArithmeticIMulInclusiveScan:
+	case SubgroupArithmeticFMulReduce:
+	case SubgroupArithmeticFMulExclusiveScan:
+	case SubgroupArithmeticFMulInclusiveScan:
+		return { KHR_shader_subgroup_arithmetic, NV_shader_thread_shuffle };
 	default:
 		return {};
 	}
@@ -17864,6 +18154,7 @@ CompilerGLSL::ShaderSubgroupSupportHelper::Result::Result()
 	weights[KHR_shader_subgroup_ballot] = big_num;
 	weights[KHR_shader_subgroup_basic] = big_num;
 	weights[KHR_shader_subgroup_vote] = big_num;
+	weights[KHR_shader_subgroup_arithmetic] = big_num;
 }
 
 void CompilerGLSL::request_workaround_wrapper_overload(TypeID id)
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index 0b40ed80..d6e2477e 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -292,6 +292,7 @@ protected:
 		KHR_shader_subgroup_ballot,
 		KHR_shader_subgroup_basic,
 		KHR_shader_subgroup_vote,
+		KHR_shader_subgroup_arithmetic,
 		NV_gpu_shader_5,
 		NV_shader_thread_group,
 		NV_shader_thread_shuffle,
@@ -324,7 +325,18 @@ protected:
 		SubgroupInverseBallot_InclBitCount_ExclBitCout = 13,
 		SubgroupBallotBitExtract = 14,
 		SubgroupBallotBitCount = 15,
-
+		SubgroupArithmeticIAddReduce = 16,
+		SubgroupArithmeticIAddExclusiveScan = 17,
+		SubgroupArithmeticIAddInclusiveScan = 18,
+		SubgroupArithmeticFAddReduce = 19,
+		SubgroupArithmeticFAddExclusiveScan = 20,
+		SubgroupArithmeticFAddInclusiveScan = 21,
+		SubgroupArithmeticIMulReduce = 22,
+		SubgroupArithmeticIMulExclusiveScan = 23,
+		SubgroupArithmeticIMulInclusiveScan = 24,
+		SubgroupArithmeticFMulReduce = 25,
+		SubgroupArithmeticFMulExclusiveScan = 26,
+		SubgroupArithmeticFMulInclusiveScan = 27,
 		FeatureCount
 	};
@@ -358,7 +370,7 @@ protected:
 	};
 
 	// TODO remove this function when all subgroup ops are supported (or make it always return true)
-	static bool is_supported_subgroup_op_in_opengl(spv::Op op);
+	static bool is_supported_subgroup_op_in_opengl(spv::Op op, const uint32_t *ops);
 
 	void reset(uint32_t iteration_count);
 	void emit_function(SPIRFunction &func, const Bitset &return_flags);
@@ -627,6 +639,7 @@ protected:
 	void emit_struct(SPIRType &type);
 	void emit_resources();
 	void emit_extension_workarounds(spv::ExecutionModel model);
+	void emit_subgroup_arithmetic_workaround(const std::string &func, spv::Op op, spv::GroupOperation group_op);
 	void emit_polyfills(uint32_t polyfills, bool relaxed);
 	void emit_buffer_block_native(const SPIRVariable &var);
 	void emit_buffer_reference_block(uint32_t type_id, bool forward_declaration);
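As a side note, the scan/reduce shape that emit_subgroup_arithmetic_workaround builds out of statement() calls is easier to see as plain code. The following is a small, self-contained C++ model (illustrative only; the lane array, SIZE and function names are hypothetical, not part of SPIRV-Cross) of the fully-active path: a shuffle-XOR butterfly for reductions and a shuffle-up Hillis-Steele loop for inclusive scans, with the exclusive scan derived by shifting the inclusive result and injecting the identity into the first lane, mirroring the shuffleUpNV(..., 1u, ...) plus subgroupElect() fixup above.

#include <array>
#include <cstdio>

constexpr unsigned SIZE = 8; // stand-in for gl_SubgroupSize

// Butterfly reduction: mirrors the shuffleXorNV loop used for subgroupAdd/subgroupMul.
std::array<unsigned, SIZE> reduce_add(std::array<unsigned, SIZE> v)
{
    for (unsigned i = 1; i <= SIZE / 2; i <<= 1)
    {
        std::array<unsigned, SIZE> partner = v; // snapshot = simultaneous shuffle
        for (unsigned lane = 0; lane < SIZE; lane++)
            v[lane] += partner[lane ^ i]; // shuffleXorNV(reduction, i, ...)
    }
    return v; // every lane now holds the full sum
}

// Hillis-Steele inclusive scan: mirrors the shuffleUpNV loop; "valid" is false when the
// source lane would be negative, in which case the additive identity (0) is used instead.
std::array<unsigned, SIZE> inclusive_scan_add(std::array<unsigned, SIZE> v)
{
    for (unsigned i = 1; i <= SIZE / 2; i <<= 1)
    {
        std::array<unsigned, SIZE> prev = v;
        for (unsigned lane = 0; lane < SIZE; lane++)
            v[lane] += (lane >= i) ? prev[lane - i] : 0u; // shuffleUpNV(result, i, ...)
    }
    return v;
}

// Exclusive scan = inclusive scan shifted up by one lane, identity in the first lane.
std::array<unsigned, SIZE> exclusive_scan_add(const std::array<unsigned, SIZE> &v)
{
    std::array<unsigned, SIZE> incl = inclusive_scan_add(v);
    std::array<unsigned, SIZE> excl{};
    for (unsigned lane = 1; lane < SIZE; lane++)
        excl[lane] = incl[lane - 1];
    excl[0] = 0u; // additive identity for the elected first lane
    return excl;
}

int main()
{
    std::array<unsigned, SIZE> data{ 1, 2, 3, 4, 5, 6, 7, 8 };
    std::printf("reduce: %u\n", reduce_add(data)[0]);               // 36 on every lane
    std::printf("inclusive[7]: %u\n", inclusive_scan_add(data)[7]); // 36
    std::printf("exclusive[7]: %u\n", exclusive_scan_add(data)[7]); // 28
}

The partially-active fallback in the generated GLSL follows the same idea but walks every lane with shuffleNV and masks contributions with subgroupBallotBitExtract, which is why the ballot helpers are listed as feature dependencies above.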