Merge pull request #2133 from georgeouzou/main
GLSL: Support GL_KHR_shader_subgroup_arithmetic (WIP)
This commit is contained in:
commit
54b48a2f3a
892
reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp
Normal file
892
reference/shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp
Normal file
@ -0,0 +1,892 @@
|
||||
#version 450

// Extension negotiation for the emulated subgroup features.
// Every group enables the first provider whose macro is defined and aborts compilation
// with #error when no provider exists. Repeated, identical selection groups have been
// collapsed: requiring the same extension more than once has no additional effect.

// Ballot support: KHR, then NV thread groups, then ARB ballot together with ARB 64-bit integers.
#if defined(GL_KHR_shader_subgroup_ballot)
#extension GL_KHR_shader_subgroup_ballot : require
#elif defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
#extension GL_ARB_shader_int64 : enable
#extension GL_ARB_shader_ballot : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

// Basic subgroup support: KHR, NV thread groups, ARB ballot, or AMD GCN with a 64-bit integer extension.
#if defined(GL_KHR_shader_subgroup_basic)
#extension GL_KHR_shader_subgroup_basic : require
#elif defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
#extension GL_ARB_shader_int64 : enable
#extension GL_ARB_shader_ballot : require
#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5))
#extension GL_AMD_gpu_shader_int64 : enable
#extension GL_NV_gpu_shader5 : enable
#extension GL_AMD_gcn_shader : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

// NV thread groups are additionally required whenever they are available,
// independent of which provider was chosen above.
#if defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#endif

// Arithmetic support: KHR, otherwise emulated on top of NV thread shuffles.
#if defined(GL_KHR_shader_subgroup_arithmetic)
#extension GL_KHR_shader_subgroup_arithmetic : require
#elif defined(GL_NV_shader_thread_shuffle)
#extension GL_NV_shader_thread_shuffle : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif
|
||||
// One-dimensional workgroup of 128 invocations.
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Result buffer: one slot per tested type (float and double, scalar through 4-component).
layout(std430, binding = 1) buffer DATA_OUT
{
    float  data_out_float;
    vec2   data_out_vec2;
    vec3   data_out_vec3;
    vec4   data_out_vec4;
    double data_out_double;
    dvec2  data_out_dvec2;
    dvec3  data_out_dvec3;
    dvec4  data_out_dvec4;
} _16;

// Input buffer: 128 elements per tested type; main() indexes these with gl_LocalInvocationID.x.
layout(std430, binding = 0) buffer DATA_IN
{
    float  data_in_float[128];
    vec2   data_in_vec2[128];
    vec3   data_in_vec3[128];
    vec4   data_in_vec4[128];
    double data_in_double[128];
    dvec2  data_in_dvec2[128];
    dvec3  data_in_dvec3[128];
    dvec4  data_in_dvec4[128];
} _31;
|
||||
|
||||
// Fallback spellings for the gl_Subgroup*Mask values when KHR ballot is not available.
// Each macro widens the vendor mask to a uvec4 and zero-fills the unused components.
#ifndef GL_KHR_shader_subgroup_ballot
#  if defined(GL_NV_shader_thread_group)
     // NV: the mask value is placed in .x.
#    define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u)
#    define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u)
#    define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u)
#    define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u)
#    define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u)
#  elif defined(GL_ARB_shader_ballot)
     // ARB: the mask is split with unpackUint2x32 into .xy.
#    define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u)
#    define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u)
#    define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u)
#    define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u)
#    define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u)
#  endif
#endif
|
||||
|
||||
// Fallback spelling for gl_SubgroupSize when KHR basic is not available.
// Providers are tried in the order NV, ARB, AMD; the AMD value is converted with uint().
#ifndef GL_KHR_shader_subgroup_basic
#  if defined(GL_NV_shader_thread_group)
#    define gl_SubgroupSize gl_WarpSizeNV
#  elif defined(GL_ARB_shader_ballot)
#    define gl_SubgroupSize gl_SubGroupSizeARB
#  elif defined(GL_AMD_gcn_shader)
#    define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD)
#  endif
#endif
|
||||
|
||||
// Fallback subgroupBallot(): wraps the vendor ballot intrinsic and widens its result to a
// uvec4, zero-filling the components the vendor result does not cover.
#ifndef GL_KHR_shader_subgroup_ballot
#  if defined(GL_NV_shader_thread_group)
uvec4 subgroupBallot(bool v)
{
    // NV: the ballot value goes into .x.
    return uvec4(ballotThreadNV(v), 0u, 0u, 0u);
}
#  elif defined(GL_ARB_shader_ballot)
uvec4 subgroupBallot(bool v)
{
    // ARB: the ballot value is split with unpackUint2x32 into .xy.
    return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u);
}
#  endif
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_basic
// Fallback subgroupElect(): returns true for exactly one active invocation, the one whose
// bit is the lowest set bit of the active-lane ballot.
//
// This version relies only on subgroupBallot() and gl_SubgroupEqMask, both of which this
// shader provides for every non-KHR provider. The earlier form called
// subgroupBallotFindLSB() and read gl_SubgroupInvocationID, for which no fallback is
// declared in this shader, so the emulated path could not compile.
bool subgroupElect()
{
    uvec4 activeMask = subgroupBallot(true);

    // Isolate the lowest set bit of the ballot. (w & (~w + 1u)) keeps only the lowest
    // set bit of a word; the first non-zero word, scanning x -> w, holds that bit.
    uvec4 firstLive = uvec4(0u);
    if (activeMask.x != 0u)
    {
        firstLive.x = activeMask.x & (~activeMask.x + 1u);
    }
    else if (activeMask.y != 0u)
    {
        firstLive.y = activeMask.y & (~activeMask.y + 1u);
    }
    else if (activeMask.z != 0u)
    {
        firstLive.z = activeMask.z & (~activeMask.z + 1u);
    }
    else
    {
        firstLive.w = activeMask.w & (~activeMask.w + 1u);
    }

    // gl_SubgroupEqMask has only this invocation's own bit set, so the comparison is true
    // only for the invocation that owns the lowest active bit.
    return firstLive == gl_SubgroupEqMask;
}
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_ballot
// Fallback subgroupBallotBitCount(): number of set bits in a ballot value.
// With NV thread groups only value.x is counted; otherwise value.x and value.y are counted.
// value.z and value.w are never inspected.
uint subgroupBallotBitCount(uvec4 value)
{
#ifdef GL_NV_shader_thread_group
    int set_bits = bitCount(value.x);
#else
    int set_bits = bitCount(value.x) + bitCount(value.y);
#endif
    return uint(set_bits);
}
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_ballot
// Fallback subgroupBallotBitExtract(): tests whether bit `index` of a ballot value is set.
// With NV thread groups the bit is always taken from value.x; otherwise the 32-bit word is
// selected by index / 32 and the bit within it by index % 32.
bool subgroupBallotBitExtract(uvec4 value, uint index)
{
#ifdef GL_NV_shader_thread_group
    uint word = value.x;
    uint bit = index;
#else
    uint word = value[index / 32u];
    uint bit = index % 32u;
#endif
    return ((word >> bit) & 1u) == 1u;
}
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulated subgroupAdd for float, built on the NV shuffle intrinsics.
// Returns the sum of `v` over the lanes that are set in subgroupBallot(true).
float subgroupAdd(float v)
{
    uvec4 live_mask = subgroupBallot(true);
    float sum = 0.0f;
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            float lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : 0.0f;
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            float partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : 0.0f;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for vec2, built on the NV shuffle intrinsics.
// Returns the component-wise sum of `v` over the lanes that are set in subgroupBallot(true).
vec2 subgroupAdd(vec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec2 sum = vec2(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : vec2(0.0f);
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            vec2 partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : vec2(0.0f);
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for vec3, built on the NV shuffle intrinsics.
// Returns the component-wise sum of `v` over the lanes that are set in subgroupBallot(true).
vec3 subgroupAdd(vec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec3 sum = vec3(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : vec3(0.0f);
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            vec3 partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : vec3(0.0f);
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for vec4, built on the NV shuffle intrinsics.
// Returns the component-wise sum of `v` over the lanes that are set in subgroupBallot(true).
vec4 subgroupAdd(vec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec4 sum = vec4(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : vec4(0.0f);
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            vec4 partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : vec4(0.0f);
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for double, built on the NV shuffle intrinsics.
// Returns the sum of `v` over the lanes that are set in subgroupBallot(true).
double subgroupAdd(double v)
{
    uvec4 live_mask = subgroupBallot(true);
    double sum = 0.0LF;
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            double lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : 0.0LF;
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            double partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : 0.0LF;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for dvec2, built on the NV shuffle intrinsics.
// Returns the component-wise sum of `v` over the lanes that are set in subgroupBallot(true).
dvec2 subgroupAdd(dvec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec2 sum = dvec2(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : dvec2(0.0LF);
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            dvec2 partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : dvec2(0.0LF);
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for dvec3, built on the NV shuffle intrinsics.
// Returns the component-wise sum of `v` over the lanes that are set in subgroupBallot(true).
dvec3 subgroupAdd(dvec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec3 sum = dvec3(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : dvec3(0.0LF);
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            dvec3 partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : dvec3(0.0LF);
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd for dvec4, built on the NV shuffle intrinsics.
// Returns the component-wise sum of `v` over the lanes that are set in subgroupBallot(true).
dvec4 subgroupAdd(dvec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec4 sum = dvec4(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: read each lane index and keep only lanes set in the ballot.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            sum += lane_live ? lane_value : dvec4(0.0LF);
        }
    }
    else
    {
        // Every lane is active: XOR-shuffle exchange with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        sum = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool partner_ok;
            dvec4 partner = shuffleXorNV(sum, stride, gl_SubgroupSize, partner_ok);
            sum += partner_ok ? partner : dvec4(0.0LF);
        }
    }
    return sum;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulated subgroupExclusiveAdd for float, built on the NV shuffle intrinsics.
// Each lane receives the sum of `v` from the lanes before it; its own value is excluded.
float subgroupExclusiveAdd(float v)
{
    uvec4 live_mask = subgroupBallot(true);
    float prefix = 0.0f;
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            float lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : 0.0f;
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            float upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : 0.0f;
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = 0.0f;
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for vec2, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes before it; its own value is excluded.
vec2 subgroupExclusiveAdd(vec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec2 prefix = vec2(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : vec2(0.0f);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            vec2 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : vec2(0.0f);
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = vec2(0.0f);
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for vec3, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes before it; its own value is excluded.
vec3 subgroupExclusiveAdd(vec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec3 prefix = vec3(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : vec3(0.0f);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            vec3 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : vec3(0.0f);
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = vec3(0.0f);
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for vec4, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes before it; its own value is excluded.
vec4 subgroupExclusiveAdd(vec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec4 prefix = vec4(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : vec4(0.0f);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            vec4 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : vec4(0.0f);
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = vec4(0.0f);
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for double, built on the NV shuffle intrinsics.
// Each lane receives the sum of `v` from the lanes before it; its own value is excluded.
double subgroupExclusiveAdd(double v)
{
    uvec4 live_mask = subgroupBallot(true);
    double prefix = 0.0LF;
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            double lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : 0.0LF;
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            double upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : 0.0LF;
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = 0.0LF;
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for dvec2, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes before it; its own value is excluded.
dvec2 subgroupExclusiveAdd(dvec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec2 prefix = dvec2(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : dvec2(0.0LF);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            dvec2 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : dvec2(0.0LF);
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = dvec2(0.0LF);
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for dvec3, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes before it; its own value is excluded.
dvec3 subgroupExclusiveAdd(dvec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec3 prefix = dvec3(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : dvec3(0.0LF);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            dvec3 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : dvec3(0.0LF);
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = dvec3(0.0LF);
        }
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd for dvec4, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes before it; its own value is excluded.
dvec4 subgroupExclusiveAdd(dvec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec4 prefix = dvec4(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLtMask.
        uint lanes_below = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_below);
            prefix += counted ? lane_value : dvec4(0.0LF);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            dvec4 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, source_ok);
            prefix += source_ok ? upstream : dvec4(0.0LF);
        }
        // Shift the scan up by one lane, then reset the elected lane to zero.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = dvec4(0.0LF);
        }
    }
    return prefix;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulated subgroupInclusiveAdd for float, built on the NV shuffle intrinsics.
// Each lane receives the sum of `v` from the lanes up to and including itself.
float subgroupInclusiveAdd(float v)
{
    uvec4 live_mask = subgroupBallot(true);
    float running = 0.0f;
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            float lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : 0.0f;
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            float upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : 0.0f;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for vec2, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes up to and including itself.
vec2 subgroupInclusiveAdd(vec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec2 running = vec2(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : vec2(0.0f);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            vec2 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : vec2(0.0f);
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for vec3, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes up to and including itself.
vec3 subgroupInclusiveAdd(vec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec3 running = vec3(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : vec3(0.0f);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            vec3 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : vec3(0.0f);
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for vec4, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes up to and including itself.
vec4 subgroupInclusiveAdd(vec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    vec4 running = vec4(0.0f);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            vec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : vec4(0.0f);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            vec4 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : vec4(0.0f);
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for double, built on the NV shuffle intrinsics.
// Each lane receives the sum of `v` from the lanes up to and including itself.
double subgroupInclusiveAdd(double v)
{
    uvec4 live_mask = subgroupBallot(true);
    double running = 0.0LF;
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            double lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : 0.0LF;
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            double upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : 0.0LF;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for dvec2, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes up to and including itself.
dvec2 subgroupInclusiveAdd(dvec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec2 running = dvec2(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : dvec2(0.0LF);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            dvec2 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : dvec2(0.0LF);
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for dvec3, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes up to and including itself.
dvec3 subgroupInclusiveAdd(dvec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec3 running = dvec3(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : dvec3(0.0LF);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            dvec3 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : dvec3(0.0LF);
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd for dvec4, built on the NV shuffle intrinsics.
// Each lane receives the component-wise sum of `v` from the lanes up to and including itself.
dvec4 subgroupInclusiveAdd(dvec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    dvec4 running = dvec4(0.0LF);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every lane is active: add the lanes that are set in the ballot and whose
        // index is below the bit count of gl_SubgroupLeMask.
        uint lanes_through_self = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            dvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = subgroupBallotBitExtract(live_mask, lane) && (lane < lanes_through_self);
            running += counted ? lane_value : dvec4(0.0LF);
        }
    }
    else
    {
        // Every lane is active: shuffle-up scan with strides 1, 2, 4, ... gl_SubgroupSize / 2.
        uint last_stride = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= last_stride; stride *= 2u)
        {
            bool source_ok;
            dvec4 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, source_ok);
            running += source_ok ? upstream : dvec4(0.0LF);
        }
    }
    return running;
}
|
||||
#endif
|
||||
|
||||
// Entry point: runs the reduction, exclusive scan and inclusive scan over every tested
// type, reading the element selected by gl_LocalInvocationID.x. Each output slot is
// written three times in sequence, so the inclusive-scan result is the last value stored.
void main()
{
    uint idx = gl_LocalInvocationID.x;

    // Full reduction.
    _16.data_out_float = subgroupAdd(_31.data_in_float[idx]);
    _16.data_out_vec2 = subgroupAdd(_31.data_in_vec2[idx]);
    _16.data_out_vec3 = subgroupAdd(_31.data_in_vec3[idx]);
    _16.data_out_vec4 = subgroupAdd(_31.data_in_vec4[idx]);
    _16.data_out_double = subgroupAdd(_31.data_in_double[idx]);
    _16.data_out_dvec2 = subgroupAdd(_31.data_in_dvec2[idx]);
    _16.data_out_dvec3 = subgroupAdd(_31.data_in_dvec3[idx]);
    _16.data_out_dvec4 = subgroupAdd(_31.data_in_dvec4[idx]);

    // Exclusive scan.
    _16.data_out_float = subgroupExclusiveAdd(_31.data_in_float[idx]);
    _16.data_out_vec2 = subgroupExclusiveAdd(_31.data_in_vec2[idx]);
    _16.data_out_vec3 = subgroupExclusiveAdd(_31.data_in_vec3[idx]);
    _16.data_out_vec4 = subgroupExclusiveAdd(_31.data_in_vec4[idx]);
    _16.data_out_double = subgroupExclusiveAdd(_31.data_in_double[idx]);
    _16.data_out_dvec2 = subgroupExclusiveAdd(_31.data_in_dvec2[idx]);
    _16.data_out_dvec3 = subgroupExclusiveAdd(_31.data_in_dvec3[idx]);
    _16.data_out_dvec4 = subgroupExclusiveAdd(_31.data_in_dvec4[idx]);

    // Inclusive scan.
    _16.data_out_float = subgroupInclusiveAdd(_31.data_in_float[idx]);
    _16.data_out_vec2 = subgroupInclusiveAdd(_31.data_in_vec2[idx]);
    _16.data_out_vec3 = subgroupInclusiveAdd(_31.data_in_vec3[idx]);
    _16.data_out_vec4 = subgroupInclusiveAdd(_31.data_in_vec4[idx]);
    _16.data_out_double = subgroupInclusiveAdd(_31.data_in_double[idx]);
    _16.data_out_dvec2 = subgroupInclusiveAdd(_31.data_in_dvec2[idx]);
    _16.data_out_dvec3 = subgroupInclusiveAdd(_31.data_in_dvec3[idx]);
    _16.data_out_dvec4 = subgroupInclusiveAdd(_31.data_in_dvec4[idx]);
}
|
||||
|
@ -0,0 +1,56 @@
|
||||
#version 450
// This variant requires the KHR arithmetic extension directly; it declares no fallback.
#extension GL_KHR_shader_subgroup_arithmetic : require

// One-dimensional workgroup of 128 invocations.
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Result buffer (descriptor set 0, binding 1): one slot per tested type.
layout(std430, set = 0, binding = 1) buffer DATA_OUT
{
    float  data_out_float;
    vec2   data_out_vec2;
    vec3   data_out_vec3;
    vec4   data_out_vec4;
    double data_out_double;
    dvec2  data_out_dvec2;
    dvec3  data_out_dvec3;
    dvec4  data_out_dvec4;
} _16;

// Input buffer (descriptor set 0, binding 0): 128 elements per tested type.
layout(std430, set = 0, binding = 0) buffer DATA_IN
{
    float  data_in_float[128];
    vec2   data_in_vec2[128];
    vec3   data_in_vec3[128];
    vec4   data_in_vec4[128];
    double data_in_double[128];
    dvec2  data_in_dvec2[128];
    dvec3  data_in_dvec3[128];
    dvec4  data_in_dvec4[128];
} _31;
|
||||
|
||||
// Entry point: runs subgroupAdd, subgroupExclusiveAdd and subgroupInclusiveAdd
// on every input type, indexing the inputs by the invocation's local X id.
// Each output member is assigned three times; the last assignment is the one left in place.
void main()
{
    uint slot = gl_LocalInvocationID.x;

    // Reductions.
    _16.data_out_float = subgroupAdd(_31.data_in_float[slot]);
    _16.data_out_vec2 = subgroupAdd(_31.data_in_vec2[slot]);
    _16.data_out_vec3 = subgroupAdd(_31.data_in_vec3[slot]);
    _16.data_out_vec4 = subgroupAdd(_31.data_in_vec4[slot]);
    _16.data_out_double = subgroupAdd(_31.data_in_double[slot]);
    _16.data_out_dvec2 = subgroupAdd(_31.data_in_dvec2[slot]);
    _16.data_out_dvec3 = subgroupAdd(_31.data_in_dvec3[slot]);
    _16.data_out_dvec4 = subgroupAdd(_31.data_in_dvec4[slot]);

    // Exclusive scans.
    _16.data_out_float = subgroupExclusiveAdd(_31.data_in_float[slot]);
    _16.data_out_vec2 = subgroupExclusiveAdd(_31.data_in_vec2[slot]);
    _16.data_out_vec3 = subgroupExclusiveAdd(_31.data_in_vec3[slot]);
    _16.data_out_vec4 = subgroupExclusiveAdd(_31.data_in_vec4[slot]);
    _16.data_out_double = subgroupExclusiveAdd(_31.data_in_double[slot]);
    _16.data_out_dvec2 = subgroupExclusiveAdd(_31.data_in_dvec2[slot]);
    _16.data_out_dvec3 = subgroupExclusiveAdd(_31.data_in_dvec3[slot]);
    _16.data_out_dvec4 = subgroupExclusiveAdd(_31.data_in_dvec4[slot]);

    // Inclusive scans.
    _16.data_out_float = subgroupInclusiveAdd(_31.data_in_float[slot]);
    _16.data_out_vec2 = subgroupInclusiveAdd(_31.data_in_vec2[slot]);
    _16.data_out_vec3 = subgroupInclusiveAdd(_31.data_in_vec3[slot]);
    _16.data_out_vec4 = subgroupInclusiveAdd(_31.data_in_vec4[slot]);
    _16.data_out_double = subgroupInclusiveAdd(_31.data_in_double[slot]);
    _16.data_out_dvec2 = subgroupInclusiveAdd(_31.data_in_dvec2[slot]);
    _16.data_out_dvec3 = subgroupInclusiveAdd(_31.data_in_dvec3[slot]);
    _16.data_out_dvec4 = subgroupInclusiveAdd(_31.data_in_dvec4[slot]);
}
|
||||
|
892
reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp
Normal file
892
reference/shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp
Normal file
@ -0,0 +1,892 @@
|
||||
#version 450
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_ballot)
|
||||
#extension GL_KHR_shader_subgroup_ballot : require
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
||||
#extension GL_ARB_shader_int64 : enable
|
||||
#extension GL_ARB_shader_ballot : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_basic)
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
||||
#extension GL_ARB_shader_int64 : enable
|
||||
#extension GL_ARB_shader_ballot : require
|
||||
#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5))
|
||||
#extension GL_AMD_gpu_shader_int64 : enable
|
||||
#extension GL_NV_gpu_shader5 : enable
|
||||
#extension GL_AMD_gcn_shader : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_ballot)
|
||||
#extension GL_KHR_shader_subgroup_ballot : require
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
||||
#extension GL_ARB_shader_int64 : enable
|
||||
#extension GL_ARB_shader_ballot : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
#extension GL_NV_shader_thread_shuffle : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
#extension GL_NV_shader_thread_shuffle : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
#extension GL_NV_shader_thread_shuffle : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Result storage block (binding 1, std430 layout).
// Holds one member for each float/double scalar and vector width.
layout(std430, binding = 1) buffer DATA_OUT
{
    // Single-precision results.
    float data_out_float;
    vec2 data_out_vec2;
    vec3 data_out_vec3;
    vec4 data_out_vec4;
    // Double-precision results.
    double data_out_double;
    dvec2 data_out_dvec2;
    dvec3 data_out_dvec3;
    dvec4 data_out_dvec4;
} _16;
|
||||
|
||||
// Input storage block (binding 0, std430 layout).
// Each member is a 128-element array, matching local_size_x = 128 declared above.
layout(std430, binding = 0) buffer DATA_IN
{
    // Single-precision inputs.
    float data_in_float[128];
    vec2 data_in_vec2[128];
    vec3 data_in_vec3[128];
    vec4 data_in_vec4[128];
    // Double-precision inputs.
    double data_in_double[128];
    dvec2 data_in_dvec2[128];
    dvec3 data_in_dvec3[128];
    dvec4 data_in_dvec4[128];
} _31;
|
||||
|
||||
// Fallback spellings of the gl_Subgroup{Eq,Ge,Gt,Le,Lt}Mask names for
// compilers that lack GL_KHR_shader_subgroup_ballot.
#if defined(GL_KHR_shader_subgroup_ballot)
// Nothing to define in this branch.
#elif defined(GL_NV_shader_thread_group)
// NV thread masks go into .x; the other three components are zero.
#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u)
#elif defined(GL_ARB_shader_ballot)
// ARB masks are split by unpackUint2x32 into .xy; .zw are zero.
#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u)
#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u)
#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u)
#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u)
#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u)
#endif
|
||||
|
||||
// Fallback spelling of gl_SubgroupSize for compilers that lack
// GL_KHR_shader_subgroup_basic; the first matching vendor extension wins.
#if defined(GL_KHR_shader_subgroup_basic)
// Nothing to define in this branch.
#elif defined(GL_NV_shader_thread_group)
#define gl_SubgroupSize gl_WarpSizeNV
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupSize gl_SubGroupSizeARB
#elif defined(GL_AMD_gcn_shader)
// The AMD value is wrapped in a uint() conversion.
#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD)
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_ballot)
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
// NV fallback: the ballotThreadNV result occupies .x, the rest is zero.
uvec4 subgroupBallot(bool v)
{
    uint votes = ballotThreadNV(v);
    return uvec4(votes, 0u, 0u, 0u);
}
|
||||
#elif defined(GL_ARB_shader_ballot)
|
||||
// ARB fallback: the ballotARB result is unpacked into .xy, .zw are zero.
uvec4 subgroupBallot(bool v)
{
    uvec2 votes = unpackUint2x32(ballotARB(v));
    return uvec4(votes, 0u, 0u);
}
|
||||
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_basic
|
||||
// Fallback subgroupElect: true only for the invocation whose id equals the
// lowest set bit of the ballot of currently active invocations.
// NOTE(review): subgroupBallotFindLSB and gl_SubgroupInvocationID have no
// fallback definition in the visible part of this shader - confirm they are available.
bool subgroupElect()
{
    uint lowest_live = subgroupBallotFindLSB(subgroupBallot(true));
    return lowest_live == gl_SubgroupInvocationID;
}
|
||||
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_ballot
|
||||
// Fallback subgroupBallotBitCount: counts set bits in the ballot mask.
// Only .x is counted on the NV path, .x and .y otherwise; .zw are never read.
uint subgroupBallotBitCount(uvec4 value)
{
    ivec2 per_word = bitCount(value.xy);
#ifndef GL_NV_shader_thread_group
    return uint(per_word.x + per_word.y);
#else
    return uint(per_word.x);
#endif
}
|
||||
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_ballot
|
||||
// Fallback subgroupBallotBitExtract: tests bit `index` of the ballot mask.
// The NV path shifts .x directly; otherwise the 32-bit word is chosen by
// index >> 5 and the bit within it by index & 31.
bool subgroupBallotBitExtract(uvec4 value, uint index)
{
#ifdef GL_NV_shader_thread_group
    uint word = value.x;
    uint bit = index;
#else
    uint word = value[index >> 5u];
    uint bit = index & 0x1fu;
#endif
    return ((word >> bit) & 1u) != 0u;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Shuffle-based fallback for subgroupMul(float): multiplies together the
// values of `v` held by the invocations flagged in the active ballot.
float subgroupMul(float v)
{
    float product = 1.0f;
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            float partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            float factor = in_range ? partner : 1.0f;
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            float lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            float factor = lane_live ? lane_value : 1.0f;
            product = product * factor;
        }
    }
    return product;
}
|
||||
// Shuffle-based fallback for subgroupMul(vec2): component-wise product of
// `v` over the invocations flagged in the active ballot.
vec2 subgroupMul(vec2 v)
{
    vec2 product = vec2(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec2 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            vec2 factor = in_range ? partner : vec2(1.0f);
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            vec2 factor = lane_live ? lane_value : vec2(1.0f);
            product = product * factor;
        }
    }
    return product;
}
|
||||
// Shuffle-based fallback for subgroupMul(vec3): component-wise product of
// `v` over the invocations flagged in the active ballot.
vec3 subgroupMul(vec3 v)
{
    vec3 product = vec3(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec3 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            vec3 factor = in_range ? partner : vec3(1.0f);
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            vec3 factor = lane_live ? lane_value : vec3(1.0f);
            product = product * factor;
        }
    }
    return product;
}
|
||||
// Shuffle-based fallback for subgroupMul(vec4): component-wise product of
// `v` over the invocations flagged in the active ballot.
vec4 subgroupMul(vec4 v)
{
    vec4 product = vec4(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec4 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            vec4 factor = in_range ? partner : vec4(1.0f);
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            vec4 factor = lane_live ? lane_value : vec4(1.0f);
            product = product * factor;
        }
    }
    return product;
}
|
||||
// Shuffle-based fallback for subgroupMul(double): multiplies together the
// values of `v` held by the invocations flagged in the active ballot.
//
// The seed and the value substituted for non-contributing lanes must be the
// multiplicative identity 1.0LF. Using 0.0LF (as the additive variant does)
// forces the partial-subgroup product to zero regardless of the inputs.
double subgroupMul(double v)
{
    double reduction = 1.0LF;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            double s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction *= valid ? s : 1.0LF;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            double s = shuffleNV(v, i, gl_SubgroupSize);
            reduction *= valid ? s : 1.0LF;
        }
    }
    return reduction;
}
|
||||
// Shuffle-based fallback for subgroupMul(dvec2): component-wise product of
// `v` over the invocations flagged in the active ballot.
dvec2 subgroupMul(dvec2 v)
{
    dvec2 product = dvec2(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec2 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            dvec2 factor = in_range ? partner : dvec2(1.0LF);
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            dvec2 factor = lane_live ? lane_value : dvec2(1.0LF);
            product = product * factor;
        }
    }
    return product;
}
|
||||
// Shuffle-based fallback for subgroupMul(dvec3): component-wise product of
// `v` over the invocations flagged in the active ballot.
dvec3 subgroupMul(dvec3 v)
{
    dvec3 product = dvec3(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec3 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            dvec3 factor = in_range ? partner : dvec3(1.0LF);
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            dvec3 factor = lane_live ? lane_value : dvec3(1.0LF);
            product = product * factor;
        }
    }
    return product;
}
|
||||
// Shuffle-based fallback for subgroupMul(dvec4): component-wise product of
// `v` over the invocations flagged in the active ballot.
dvec4 subgroupMul(dvec4 v)
{
    dvec4 product = dvec4(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: combine with XOR partners at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        product = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec4 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
            dvec4 factor = in_range ? partner : dvec4(1.0LF);
            product = product * factor;
        }
    }
    else
    {
        // Partial subgroup: visit every lane, inactive lanes contribute 1.
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            dvec4 factor = lane_live ? lane_value : dvec4(1.0LF);
            product = product * factor;
        }
    }
    return product;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(float): running product of
// `v` over the lanes preceding this one; the invocation's own value is excluded.
float subgroupExclusiveMul(float v)
{
    float prefix = 1.0f;
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            float upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            float factor = in_range ? upstream : 1.0f;
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = 1.0f;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            float lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            float factor = counted ? lane_value : 1.0f;
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(vec2): component-wise running
// product of `v` over the lanes preceding this one; the own value is excluded.
vec2 subgroupExclusiveMul(vec2 v)
{
    vec2 prefix = vec2(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec2 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            vec2 factor = in_range ? upstream : vec2(1.0f);
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = vec2(1.0f);
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            vec2 factor = counted ? lane_value : vec2(1.0f);
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(vec3): component-wise running
// product of `v` over the lanes preceding this one; the own value is excluded.
vec3 subgroupExclusiveMul(vec3 v)
{
    vec3 prefix = vec3(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec3 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            vec3 factor = in_range ? upstream : vec3(1.0f);
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = vec3(1.0f);
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            vec3 factor = counted ? lane_value : vec3(1.0f);
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(vec4): component-wise running
// product of `v` over the lanes preceding this one; the own value is excluded.
vec4 subgroupExclusiveMul(vec4 v)
{
    vec4 prefix = vec4(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec4 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            vec4 factor = in_range ? upstream : vec4(1.0f);
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = vec4(1.0f);
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            vec4 factor = counted ? lane_value : vec4(1.0f);
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(double): running product of
// `v` over the lanes preceding this one; the invocation's own value is excluded.
//
// Every neutral value here must be the multiplicative identity 1.0LF: the
// seed, the substitute for lanes that do not contribute, and the value given
// to the elected invocation. Using 0.0LF instead zeroes the scan.
double subgroupExclusiveMul(double v)
{
    double excl_scan = 1.0LF;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            double s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan *= valid ? s : 1.0LF;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = 1.0LF;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            double s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan *= valid ? s : 1.0LF;
        }
    }
    return excl_scan;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(dvec2): component-wise running
// product of `v` over the lanes preceding this one; the own value is excluded.
dvec2 subgroupExclusiveMul(dvec2 v)
{
    dvec2 prefix = dvec2(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec2 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            dvec2 factor = in_range ? upstream : dvec2(1.0LF);
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = dvec2(1.0LF);
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            dvec2 factor = counted ? lane_value : dvec2(1.0LF);
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(dvec3): component-wise running
// product of `v` over the lanes preceding this one; the own value is excluded.
dvec3 subgroupExclusiveMul(dvec3 v)
{
    dvec3 prefix = dvec3(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec3 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            dvec3 factor = in_range ? upstream : dvec3(1.0LF);
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = dvec3(1.0LF);
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            dvec3 factor = counted ? lane_value : dvec3(1.0LF);
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
// Shuffle-based fallback for subgroupExclusiveMul(dvec4): component-wise running
// product of `v` over the lanes preceding this one; the own value is excluded.
dvec4 subgroupExclusiveMul(dvec4 v)
{
    dvec4 prefix = dvec4(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        prefix = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec4 upstream = shuffleUpNV(prefix, stride, gl_SubgroupSize, in_range);
            dvec4 factor = in_range ? upstream : dvec4(1.0LF);
            prefix = prefix * factor;
        }
        // Take the neighbour's running product; the elected invocation restarts at 1.
        prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            prefix = dvec4(1.0LF);
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLtMask bit count contribute.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < lt_count);
            dvec4 factor = counted ? lane_value : dvec4(1.0LF);
            prefix = prefix * factor;
        }
    }
    return prefix;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(float): running product of
// `v` over the lanes up to and including this one.
float subgroupInclusiveMul(float v)
{
    float running = 1.0f;
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            float upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            float factor = in_range ? upstream : 1.0f;
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            float lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            float factor = counted ? lane_value : 1.0f;
            running = running * factor;
        }
    }
    return running;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(vec2): component-wise running
// product of `v` over the lanes up to and including this one.
vec2 subgroupInclusiveMul(vec2 v)
{
    vec2 running = vec2(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec2 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            vec2 factor = in_range ? upstream : vec2(1.0f);
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            vec2 factor = counted ? lane_value : vec2(1.0f);
            running = running * factor;
        }
    }
    return running;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(vec3): component-wise running
// product of `v` over the lanes up to and including this one.
vec3 subgroupInclusiveMul(vec3 v)
{
    vec3 running = vec3(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec3 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            vec3 factor = in_range ? upstream : vec3(1.0f);
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            vec3 factor = counted ? lane_value : vec3(1.0f);
            running = running * factor;
        }
    }
    return running;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(vec4): component-wise running
// product of `v` over the lanes up to and including this one.
vec4 subgroupInclusiveMul(vec4 v)
{
    vec4 running = vec4(1.0f);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            vec4 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            vec4 factor = in_range ? upstream : vec4(1.0f);
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            vec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            vec4 factor = counted ? lane_value : vec4(1.0f);
            running = running * factor;
        }
    }
    return running;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(double): running product of
// `v` over the lanes up to and including this one.
//
// The seed and the value substituted for non-contributing lanes must be the
// multiplicative identity 1.0LF. Using 0.0LF would zero the scan for any lane
// that sees an out-of-range shuffle or takes the partial-subgroup path.
double subgroupInclusiveMul(double v)
{
    double incl_scan = 1.0LF;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            double s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan *= valid ? s : 1.0LF;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            double s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan *= valid ? s : 1.0LF;
        }
    }
    return incl_scan;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(dvec2): component-wise running
// product of `v` over the lanes up to and including this one.
dvec2 subgroupInclusiveMul(dvec2 v)
{
    dvec2 running = dvec2(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec2 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            dvec2 factor = in_range ? upstream : dvec2(1.0LF);
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            dvec2 factor = counted ? lane_value : dvec2(1.0LF);
            running = running * factor;
        }
    }
    return running;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(dvec3): component-wise running
// product of `v` over the lanes up to and including this one.
dvec3 subgroupInclusiveMul(dvec3 v)
{
    dvec3 running = dvec3(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec3 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            dvec3 factor = in_range ? upstream : dvec3(1.0LF);
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            dvec3 factor = counted ? lane_value : dvec3(1.0LF);
            running = running * factor;
        }
    }
    return running;
}
|
||||
// Shuffle-based fallback for subgroupInclusiveMul(dvec4): component-wise running
// product of `v` over the lanes up to and including this one.
dvec4 subgroupInclusiveMul(dvec4 v)
{
    dvec4 running = dvec4(1.0LF);
    uvec4 live_mask = subgroupBallot(true);
    bool all_live = subgroupBallotBitCount(live_mask) == gl_SubgroupSize;
    if (all_live)
    {
        // Full subgroup: build a running product with shuffle-up at strides 1, 2, 4, ...
        uint half_size = gl_SubgroupSize / 2u;
        running = v;
        for (uint stride = 1u; stride <= half_size; stride <<= 1u)
        {
            bool in_range;
            dvec4 upstream = shuffleUpNV(running, stride, gl_SubgroupSize, in_range);
            dvec4 factor = in_range ? upstream : dvec4(1.0LF);
            running = running * factor;
        }
    }
    else
    {
        // Partial subgroup: only active lanes below the gl_SubgroupLeMask bit count contribute.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            dvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            bool counted = lane_live && (lane < le_count);
            dvec4 factor = counted ? lane_value : dvec4(1.0LF);
            running = running * factor;
        }
    }
    return running;
}
|
||||
#endif
|
||||
|
||||
// Entry point: runs subgroupMul, subgroupExclusiveMul and subgroupInclusiveMul
// on every input type, indexing the inputs by the invocation's local X id.
// Each output member is assigned three times; the last assignment is the one left in place.
void main()
{
    uint slot = gl_LocalInvocationID.x;

    // Reductions.
    _16.data_out_float = subgroupMul(_31.data_in_float[slot]);
    _16.data_out_vec2 = subgroupMul(_31.data_in_vec2[slot]);
    _16.data_out_vec3 = subgroupMul(_31.data_in_vec3[slot]);
    _16.data_out_vec4 = subgroupMul(_31.data_in_vec4[slot]);
    _16.data_out_double = subgroupMul(_31.data_in_double[slot]);
    _16.data_out_dvec2 = subgroupMul(_31.data_in_dvec2[slot]);
    _16.data_out_dvec3 = subgroupMul(_31.data_in_dvec3[slot]);
    _16.data_out_dvec4 = subgroupMul(_31.data_in_dvec4[slot]);

    // Exclusive scans.
    _16.data_out_float = subgroupExclusiveMul(_31.data_in_float[slot]);
    _16.data_out_vec2 = subgroupExclusiveMul(_31.data_in_vec2[slot]);
    _16.data_out_vec3 = subgroupExclusiveMul(_31.data_in_vec3[slot]);
    _16.data_out_vec4 = subgroupExclusiveMul(_31.data_in_vec4[slot]);
    _16.data_out_double = subgroupExclusiveMul(_31.data_in_double[slot]);
    _16.data_out_dvec2 = subgroupExclusiveMul(_31.data_in_dvec2[slot]);
    _16.data_out_dvec3 = subgroupExclusiveMul(_31.data_in_dvec3[slot]);
    _16.data_out_dvec4 = subgroupExclusiveMul(_31.data_in_dvec4[slot]);

    // Inclusive scans.
    _16.data_out_float = subgroupInclusiveMul(_31.data_in_float[slot]);
    _16.data_out_vec2 = subgroupInclusiveMul(_31.data_in_vec2[slot]);
    _16.data_out_vec3 = subgroupInclusiveMul(_31.data_in_vec3[slot]);
    _16.data_out_vec4 = subgroupInclusiveMul(_31.data_in_vec4[slot]);
    _16.data_out_double = subgroupInclusiveMul(_31.data_in_double[slot]);
    _16.data_out_dvec2 = subgroupInclusiveMul(_31.data_in_dvec2[slot]);
    _16.data_out_dvec3 = subgroupInclusiveMul(_31.data_in_dvec3[slot]);
    _16.data_out_dvec4 = subgroupInclusiveMul(_31.data_in_dvec4[slot]);
}
|
||||
|
@ -0,0 +1,56 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Output storage block (std430, set 0, binding 1). main() stores each
// subgroup multiply result into the member matching the operand type.
layout(std430, set = 0, binding = 1) buffer DATA_OUT
{
    // Single-precision results.
    float data_out_float;
    vec2 data_out_vec2;
    vec3 data_out_vec3;
    vec4 data_out_vec4;
    // Double-precision results.
    double data_out_double;
    dvec2 data_out_dvec2;
    dvec3 data_out_dvec3;
    dvec4 data_out_dvec4;
} _16;
|
||||
|
||||
// Input storage block (std430, set 0, binding 0). Each member is a
// 128-element array that main() indexes with gl_LocalInvocationID.x.
layout(std430, set = 0, binding = 0) buffer DATA_IN
{
    // Single-precision operands.
    float data_in_float[128];
    vec2 data_in_vec2[128];
    vec3 data_in_vec3[128];
    vec4 data_in_vec4[128];
    // Double-precision operands.
    double data_in_double[128];
    dvec2 data_in_dvec2[128];
    dvec3 data_in_dvec3[128];
    dvec4 data_in_dvec4[128];
} _31;
|
||||
|
||||
// Applies subgroupMul, subgroupExclusiveMul and subgroupInclusiveMul to every
// float/double scalar and vector input and stores each result in DATA_OUT.
// Later stores overwrite earlier ones to the same member; order is preserved.
void main()
{
    // Every read below uses the invocation's local x index.
    uint idx = gl_LocalInvocationID.x;

    // Full reduction.
    _16.data_out_float = subgroupMul(_31.data_in_float[idx]);
    _16.data_out_vec2 = subgroupMul(_31.data_in_vec2[idx]);
    _16.data_out_vec3 = subgroupMul(_31.data_in_vec3[idx]);
    _16.data_out_vec4 = subgroupMul(_31.data_in_vec4[idx]);
    _16.data_out_double = subgroupMul(_31.data_in_double[idx]);
    _16.data_out_dvec2 = subgroupMul(_31.data_in_dvec2[idx]);
    _16.data_out_dvec3 = subgroupMul(_31.data_in_dvec3[idx]);
    _16.data_out_dvec4 = subgroupMul(_31.data_in_dvec4[idx]);

    // Exclusive scan.
    _16.data_out_float = subgroupExclusiveMul(_31.data_in_float[idx]);
    _16.data_out_vec2 = subgroupExclusiveMul(_31.data_in_vec2[idx]);
    _16.data_out_vec3 = subgroupExclusiveMul(_31.data_in_vec3[idx]);
    _16.data_out_vec4 = subgroupExclusiveMul(_31.data_in_vec4[idx]);
    _16.data_out_double = subgroupExclusiveMul(_31.data_in_double[idx]);
    _16.data_out_dvec2 = subgroupExclusiveMul(_31.data_in_dvec2[idx]);
    _16.data_out_dvec3 = subgroupExclusiveMul(_31.data_in_dvec3[idx]);
    _16.data_out_dvec4 = subgroupExclusiveMul(_31.data_in_dvec4[idx]);

    // Inclusive scan.
    _16.data_out_float = subgroupInclusiveMul(_31.data_in_float[idx]);
    _16.data_out_vec2 = subgroupInclusiveMul(_31.data_in_vec2[idx]);
    _16.data_out_vec3 = subgroupInclusiveMul(_31.data_in_vec3[idx]);
    _16.data_out_vec4 = subgroupInclusiveMul(_31.data_in_vec4[idx]);
    _16.data_out_double = subgroupInclusiveMul(_31.data_in_double[idx]);
    _16.data_out_dvec2 = subgroupInclusiveMul(_31.data_in_dvec2[idx]);
    _16.data_out_dvec3 = subgroupInclusiveMul(_31.data_in_dvec3[idx]);
    _16.data_out_dvec4 = subgroupInclusiveMul(_31.data_in_dvec4[idx]);
}
|
||||
|
892
reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp
Normal file
892
reference/shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp
Normal file
@ -0,0 +1,892 @@
|
||||
#version 450
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_ballot)
|
||||
#extension GL_KHR_shader_subgroup_ballot : require
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
||||
#extension GL_ARB_shader_int64 : enable
|
||||
#extension GL_ARB_shader_ballot : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_basic)
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
||||
#extension GL_ARB_shader_int64 : enable
|
||||
#extension GL_ARB_shader_ballot : require
|
||||
#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5))
|
||||
#extension GL_AMD_gpu_shader_int64 : enable
|
||||
#extension GL_NV_gpu_shader5 : enable
|
||||
#extension GL_AMD_gcn_shader : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_ballot)
|
||||
#extension GL_KHR_shader_subgroup_ballot : require
|
||||
#elif defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
||||
#extension GL_ARB_shader_int64 : enable
|
||||
#extension GL_ARB_shader_ballot : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_NV_shader_thread_group)
|
||||
#extension GL_NV_shader_thread_group : require
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
#extension GL_NV_shader_thread_shuffle : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
#extension GL_NV_shader_thread_shuffle : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
#extension GL_NV_shader_thread_shuffle : require
|
||||
#else
|
||||
#error No extensions available to emulate requested subgroup feature.
|
||||
#endif
|
||||
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Output storage block (std430, binding 1). main() stores each subgroup add
// result into the member matching the operand type.
layout(std430, binding = 1) buffer DATA_OUT
{
    // Signed integer results.
    int data_out_int;
    ivec2 data_out_ivec2;
    ivec3 data_out_ivec3;
    ivec4 data_out_ivec4;
    // Unsigned integer results.
    uint data_out_uint;
    uvec2 data_out_uvec2;
    uvec3 data_out_uvec3;
    uvec4 data_out_uvec4;
} _16;
|
||||
|
||||
// Input storage block (std430, binding 0). Each member is a 128-element array
// that main() indexes with gl_LocalInvocationID.x.
layout(std430, binding = 0) buffer DATA_IN
{
    // Signed integer operands.
    int data_in_int[128];
    ivec2 data_in_ivec2[128];
    ivec3 data_in_ivec3[128];
    ivec4 data_in_ivec4[128];
    // Unsigned integer operands.
    uint data_in_uint[128];
    uvec2 data_in_uvec2[128];
    uvec3 data_in_uvec3[128];
    uvec4 data_in_uvec4[128];
} _29;
|
||||
|
||||
// Fallback definitions of the gl_Subgroup*Mask names, used only when
// GL_KHR_shader_subgroup_ballot is not defined. Each vendor mask is widened
// to a uvec4 by padding the unused components with zero.
#if !defined(GL_KHR_shader_subgroup_ballot)
#if defined(GL_NV_shader_thread_group)
// NV masks are a single uint, placed in .x.
#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u)
#elif defined(GL_ARB_shader_ballot)
// ARB masks are unpacked with unpackUint2x32 into .xy.
#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u)
#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u)
#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u)
#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u)
#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u)
#endif
#endif
|
||||
|
||||
// Fallback definition of gl_SubgroupSize, used only when
// GL_KHR_shader_subgroup_basic is not defined. The first available vendor
// extension in the order NV, ARB, AMD supplies the value.
#if !defined(GL_KHR_shader_subgroup_basic)
#if defined(GL_NV_shader_thread_group)
#define gl_SubgroupSize gl_WarpSizeNV
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupSize gl_SubGroupSizeARB
#elif defined(GL_AMD_gcn_shader)
// The AMD value is converted to uint here.
#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD)
#endif
#endif
|
||||
|
||||
// Fallback subgroupBallot(), defined only when GL_KHR_shader_subgroup_ballot
// is not defined. Returns the vendor ballot of `v` widened to a uvec4 with
// the unused components set to zero.
#if !defined(GL_KHR_shader_subgroup_ballot)
#if defined(GL_NV_shader_thread_group)
uvec4 subgroupBallot(bool v)
{
    // The NV ballot result goes in .x.
    return uvec4(ballotThreadNV(v), 0u, 0u, 0u);
}
#elif defined(GL_ARB_shader_ballot)
uvec4 subgroupBallot(bool v)
{
    // The ARB ballot is unpacked with unpackUint2x32 into .xy.
    return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u);
}
#endif
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_basic
|
||||
// Fallback subgroupElect(): returns true only for the invocation whose index
// equals the lowest set bit of subgroupBallot(true).
//
// This file provides no emulation of subgroupBallotFindLSB() or
// gl_SubgroupInvocationID, so on the NV and ARB fallback paths the lowest
// live lane and the caller's lane index are taken from core findLSB() and
// the vendor built-ins instead. Any other configuration keeps the original
// KHR spelling.
bool subgroupElect()
{
    uvec4 activeMask = subgroupBallot(true);
#if !defined(GL_KHR_shader_subgroup_ballot) && defined(GL_NV_shader_thread_group)
    // The NV ballot occupies only activeMask.x. The caller itself is active,
    // so at least one bit is set and findLSB() cannot return -1.
    uint firstLive = uint(findLSB(activeMask.x));
    return gl_ThreadInWarpNV == firstLive;
#elif !defined(GL_KHR_shader_subgroup_ballot) && defined(GL_ARB_shader_ballot)
    // The ARB ballot spans activeMask.xy; fall through to the high word when
    // the low word is empty.
    int lowBit = findLSB(activeMask.x);
    uint firstLive = uint(lowBit != -1 ? lowBit : (findLSB(activeMask.y) + 32));
    return gl_SubGroupInvocationARB == firstLive;
#else
    uint firstLive = subgroupBallotFindLSB(activeMask);
    return gl_SubgroupInvocationID == firstLive;
#endif
}
|
||||
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_ballot
|
||||
// Fallback subgroupBallotBitCount(): counts the set bits of a ballot mask.
// Only value.x is counted when GL_NV_shader_thread_group is defined;
// otherwise value.x and value.y are both counted.
uint subgroupBallotBitCount(uvec4 value)
{
    ivec2 set_bits = bitCount(value.xy);
    int total = set_bits.x;
#ifndef GL_NV_shader_thread_group
    total += set_bits.y;
#endif
    return uint(total);
}
|
||||
#endif
|
||||
|
||||
#ifndef GL_KHR_shader_subgroup_ballot
|
||||
// Fallback subgroupBallotBitExtract(): returns true when bit `index` of the
// ballot mask `value` is set.
bool subgroupBallotBitExtract(uvec4 value, uint index)
{
#ifdef GL_NV_shader_thread_group
    // NV path: all bits are taken from value.x.
    uint word = value.x;
    uint bit = index;
#else
    // Otherwise pick the 32-bit component holding the bit, then the bit
    // position inside that component.
    uint word = value[index >> 5u];
    uint bit = index & 0x1fu;
#endif
    return ((word >> bit) & 1u) != 0u;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulated subgroupAdd(uint): sums `v` over the lanes set in
// subgroupBallot(true).
uint subgroupAdd(uint v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uint sum = 0u;
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uint peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uint peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(uvec2): component-wise sum of `v` over the lanes set
// in subgroupBallot(true).
uvec2 subgroupAdd(uvec2 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec2 sum = uvec2(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec2 peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec2 peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(uvec3): component-wise sum of `v` over the lanes set
// in subgroupBallot(true).
uvec3 subgroupAdd(uvec3 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec3 sum = uvec3(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec3 peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec3 peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(uvec4): component-wise sum of `v` over the lanes set
// in subgroupBallot(true).
uvec4 subgroupAdd(uvec4 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec4 sum = uvec4(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec4 peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec4 peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(int): sums `v` over the lanes set in
// subgroupBallot(true).
int subgroupAdd(int v)
{
    uvec4 live_lanes = subgroupBallot(true);
    int sum = 0;
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            int peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        int peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(ivec2): component-wise sum of `v` over the lanes set
// in subgroupBallot(true).
ivec2 subgroupAdd(ivec2 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec2 sum = ivec2(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec2 peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec2 peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(ivec3): component-wise sum of `v` over the lanes set
// in subgroupBallot(true).
ivec3 subgroupAdd(ivec3 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec3 sum = ivec3(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec3 peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec3 peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
// Emulated subgroupAdd(ivec4): component-wise sum of `v` over the lanes set
// in subgroupBallot(true).
ivec4 subgroupAdd(ivec4 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec4 sum = ivec4(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: read every lane's `v` and add only those whose
        // ballot bit is set.
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec4 peer = shuffleNV(v, lane, gl_SubgroupSize);
            if (subgroupBallotBitExtract(live_lanes, lane))
            {
                sum += peer;
            }
        }
        return sum;
    }

    // Full ballot: combine partial sums through XOR shuffles at offsets
    // 1, 2, 4, ... up to half the subgroup size.
    sum = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec4 peer = shuffleXorNV(sum, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            sum += peer;
        }
    }
    return sum;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulated subgroupExclusiveAdd(uint), built from NV shuffles.
uint subgroupExclusiveAdd(uint v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uint prefix = 0u;
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uint peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uint peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = 0u;
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(uvec2), built from NV shuffles.
uvec2 subgroupExclusiveAdd(uvec2 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec2 prefix = uvec2(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec2 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec2 peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = uvec2(0u);
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(uvec3), built from NV shuffles.
uvec3 subgroupExclusiveAdd(uvec3 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec3 prefix = uvec3(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec3 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec3 peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = uvec3(0u);
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(uvec4), built from NV shuffles.
uvec4 subgroupExclusiveAdd(uvec4 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec4 prefix = uvec4(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec4 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec4 peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = uvec4(0u);
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(int), built from NV shuffles.
int subgroupExclusiveAdd(int v)
{
    uvec4 live_lanes = subgroupBallot(true);
    int prefix = 0;
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            int peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        int peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = 0;
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(ivec2), built from NV shuffles.
ivec2 subgroupExclusiveAdd(ivec2 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec2 prefix = ivec2(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec2 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec2 peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = ivec2(0);
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(ivec3), built from NV shuffles.
ivec3 subgroupExclusiveAdd(ivec3 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec3 prefix = ivec3(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec3 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec3 peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = ivec3(0);
    }
    return prefix;
}
|
||||
// Emulated subgroupExclusiveAdd(ivec4), built from NV shuffles.
ivec4 subgroupExclusiveAdd(ivec4 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec4 prefix = ivec4(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLtMask.
        uint lt_count = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec4 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < lt_count))
            {
                prefix += peer;
            }
        }
        return prefix;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    prefix = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec4 peer = shuffleUpNV(prefix, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            prefix += peer;
        }
    }

    // Take the accumulated value from one lane up; the elected lane is
    // reset to zero.
    prefix = shuffleUpNV(prefix, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        prefix = ivec4(0);
    }
    return prefix;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulated subgroupInclusiveAdd(uint), built from NV shuffles.
uint subgroupInclusiveAdd(uint v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uint running = 0u;
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uint peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uint peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(uvec2), built from NV shuffles.
uvec2 subgroupInclusiveAdd(uvec2 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec2 running = uvec2(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec2 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec2 peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(uvec3), built from NV shuffles.
uvec3 subgroupInclusiveAdd(uvec3 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec3 running = uvec3(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec3 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec3 peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(uvec4), built from NV shuffles.
uvec4 subgroupInclusiveAdd(uvec4 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    uvec4 running = uvec4(0u);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            uvec4 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        uvec4 peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(int), built from NV shuffles.
int subgroupInclusiveAdd(int v)
{
    uvec4 live_lanes = subgroupBallot(true);
    int running = 0;
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            int peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        int peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(ivec2), built from NV shuffles.
ivec2 subgroupInclusiveAdd(ivec2 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec2 running = ivec2(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec2 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec2 peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(ivec3), built from NV shuffles.
ivec3 subgroupInclusiveAdd(ivec3 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec3 running = ivec3(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec3 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec3 peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
// Emulated subgroupInclusiveAdd(ivec4), built from NV shuffles.
ivec4 subgroupInclusiveAdd(ivec4 v)
{
    uvec4 live_lanes = subgroupBallot(true);
    ivec4 running = ivec4(0);
    if (subgroupBallotBitCount(live_lanes) != gl_SubgroupSize)
    {
        // Partial ballot: a lane contributes when its ballot bit is set and
        // its index is below the bit count of gl_SubgroupLeMask.
        uint le_count = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; lane++)
        {
            ivec4 peer = shuffleNV(v, lane, gl_SubgroupSize);
            bool live = subgroupBallotBitExtract(live_lanes, lane);
            if (live && (lane < le_count))
            {
                running += peer;
            }
        }
        return running;
    }

    // Full ballot: accumulate through shuffle-up steps of 1, 2, 4, ...
    running = v;
    uint half_size = gl_SubgroupSize / 2u;
    for (uint offset = 1u; offset <= half_size; offset *= 2u)
    {
        bool in_range;
        ivec4 peer = shuffleUpNV(running, offset, gl_SubgroupSize, in_range);
        if (in_range)
        {
            running += peer;
        }
    }
    return running;
}
|
||||
#endif
|
||||
|
||||
// Applies subgroupAdd, subgroupExclusiveAdd and subgroupInclusiveAdd to every
// signed/unsigned scalar and vector input and stores each result in DATA_OUT.
// Later stores overwrite earlier ones to the same member; order is preserved.
void main()
{
    // Every read below uses the invocation's local x index.
    uint idx = gl_LocalInvocationID.x;

    // Full reduction.
    _16.data_out_int = subgroupAdd(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupAdd(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupAdd(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupAdd(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupAdd(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupAdd(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupAdd(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupAdd(_29.data_in_uvec4[idx]);

    // Exclusive scan.
    _16.data_out_int = subgroupExclusiveAdd(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupExclusiveAdd(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupExclusiveAdd(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupExclusiveAdd(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupExclusiveAdd(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupExclusiveAdd(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupExclusiveAdd(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupExclusiveAdd(_29.data_in_uvec4[idx]);

    // Inclusive scan.
    _16.data_out_int = subgroupInclusiveAdd(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupInclusiveAdd(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupInclusiveAdd(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupInclusiveAdd(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupInclusiveAdd(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupInclusiveAdd(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupInclusiveAdd(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupInclusiveAdd(_29.data_in_uvec4[idx]);
}
|
||||
|
@ -0,0 +1,56 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
// Workgroup shape: 128 x 1 x 1 invocations.
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Result buffer (std430, set 0, binding 1): one slot per tested integer type.
layout(std430, set = 0, binding = 1) buffer DATA_OUT
{
    int data_out_int;
    ivec2 data_out_ivec2;
    ivec3 data_out_ivec3;
    ivec4 data_out_ivec4;
    uint data_out_uint;
    uvec2 data_out_uvec2;
    uvec3 data_out_uvec3;
    uvec4 data_out_uvec4;
} _16;

// Input buffer (std430, set 0, binding 0): 128 elements per tested integer type.
layout(std430, set = 0, binding = 0) buffer DATA_IN
{
    int data_in_int[128];
    ivec2 data_in_ivec2[128];
    ivec3 data_in_ivec3[128];
    ivec4 data_in_ivec4[128];
    uint data_in_uint[128];
    uvec2 data_in_uvec2[128];
    uvec3 data_in_uvec3[128];
    uvec4 data_in_uvec4[128];
} _29;
|
||||
|
||||
// Runs subgroupAdd, subgroupExclusiveAdd and subgroupInclusiveAdd on every
// integer input type, indexing the inputs by the local invocation's x coordinate.
void main()
{
    uint idx = gl_LocalInvocationID.x;

    // Reduction.
    _16.data_out_int = subgroupAdd(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupAdd(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupAdd(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupAdd(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupAdd(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupAdd(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupAdd(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupAdd(_29.data_in_uvec4[idx]);

    // Exclusive scan.
    _16.data_out_int = subgroupExclusiveAdd(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupExclusiveAdd(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupExclusiveAdd(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupExclusiveAdd(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupExclusiveAdd(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupExclusiveAdd(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupExclusiveAdd(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupExclusiveAdd(_29.data_in_uvec4[idx]);

    // Inclusive scan.
    _16.data_out_int = subgroupInclusiveAdd(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupInclusiveAdd(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupInclusiveAdd(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupInclusiveAdd(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupInclusiveAdd(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupInclusiveAdd(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupInclusiveAdd(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupInclusiveAdd(_29.data_in_uvec4[idx]);
}
|
||||
|
892
reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp
Normal file
892
reference/shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp
Normal file
@ -0,0 +1,892 @@
|
||||
#version 450
|
||||
|
||||
// Extension selection for the subgroup emulation layer. Each chain prefers the
// KHR extension and falls back to vendor extensions; compilation stops with an
// #error when no candidate is available. Each chain is listed once: repeating
// an identical chain enables nothing new.

// Ballot and mask support.
#if defined(GL_KHR_shader_subgroup_ballot)
#extension GL_KHR_shader_subgroup_ballot : require
#elif defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
#extension GL_ARB_shader_int64 : enable
#extension GL_ARB_shader_ballot : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

// Basic subgroup built-ins.
#if defined(GL_KHR_shader_subgroup_basic)
#extension GL_KHR_shader_subgroup_basic : require
#elif defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
#extension GL_ARB_shader_int64 : enable
#extension GL_ARB_shader_ballot : require
#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5))
#extension GL_AMD_gpu_shader_int64 : enable
#extension GL_NV_gpu_shader5 : enable
#extension GL_AMD_gcn_shader : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

// Enabled whenever present, independently of which branch the chains above took.
#if defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#endif

// Arithmetic (reduce / scan) support.
#if defined(GL_KHR_shader_subgroup_arithmetic)
#extension GL_KHR_shader_subgroup_arithmetic : require
#elif defined(GL_NV_shader_thread_shuffle)
#extension GL_NV_shader_thread_shuffle : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif
|
||||
// Workgroup shape: 128 x 1 x 1 invocations.
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Result buffer (std430, binding 1): one slot per tested integer type.
layout(std430, binding = 1) buffer DATA_OUT
{
    int data_out_int;
    ivec2 data_out_ivec2;
    ivec3 data_out_ivec3;
    ivec4 data_out_ivec4;
    uint data_out_uint;
    uvec2 data_out_uvec2;
    uvec3 data_out_uvec3;
    uvec4 data_out_uvec4;
} _16;

// Input buffer (std430, binding 0): 128 elements per tested integer type.
layout(std430, binding = 0) buffer DATA_IN
{
    int data_in_int[128];
    ivec2 data_in_ivec2[128];
    ivec3 data_in_ivec3[128];
    ivec4 data_in_ivec4[128];
    uint data_in_uint[128];
    uvec2 data_in_uvec2[128];
    uvec3 data_in_uvec3[128];
    uvec4 data_in_uvec4[128];
} _29;
|
||||
|
||||
// Fallback definitions of the gl_Subgroup*Mask built-ins when the KHR ballot
// extension is unavailable. The NV masks are single scalars placed in .x; the
// ARB masks are unpacked with unpackUint2x32 into .xy. Unused components are zero.
#if !defined(GL_KHR_shader_subgroup_ballot)
#if defined(GL_NV_shader_thread_group)
#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u)
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u)
#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u)
#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u)
#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u)
#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u)
#endif
#endif
|
||||
|
||||
// Fallback definition of gl_SubgroupSize when the KHR basic extension is
// unavailable: map it onto the matching vendor built-in.
#if !defined(GL_KHR_shader_subgroup_basic)
#if defined(GL_NV_shader_thread_group)
#define gl_SubgroupSize gl_WarpSizeNV
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupSize gl_SubGroupSizeARB
#elif defined(GL_AMD_gcn_shader)
#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD)
#endif
#endif
|
||||
|
||||
// Fallback subgroupBallot when the KHR ballot extension is unavailable.
// Returns the vendor ballot widened to a uvec4 with unused components zeroed.
#if !defined(GL_KHR_shader_subgroup_ballot)
#if defined(GL_NV_shader_thread_group)
uvec4 subgroupBallot(bool v)
{
    // The NV ballot is a single scalar and occupies .x.
    return uvec4(ballotThreadNV(v), 0u, 0u, 0u);
}
#elif defined(GL_ARB_shader_ballot)
uvec4 subgroupBallot(bool v)
{
    // The ARB ballot is unpacked into two 32-bit words and occupies .xy.
    uvec2 words = unpackUint2x32(ballotARB(v));
    return uvec4(words, 0u, 0u);
}
#endif
#endif
|
||||
|
||||
// Fallback subgroupElect when the KHR basic extension is unavailable: true only
// for the active invocation with the lowest index in the ballot of live lanes.
//
// This version is self-contained. It uses the core findLSB built-in and the
// vendor invocation-index built-ins directly, because this file defines neither
// subgroupBallotFindLSB nor gl_SubgroupInvocationID for the non-KHR paths.
#ifndef GL_KHR_shader_subgroup_basic
bool subgroupElect()
{
    uvec4 activeMask = subgroupBallot(true);

    // Lowest set bit across the low 64 bits of the ballot. The fallback
    // subgroupBallot only populates .x (NV) or .xy (ARB).
    int lowWord = findLSB(activeMask.x);
    uint firstLive = uint(lowWord != -1 ? lowWord : (findLSB(activeMask.y) + 32));

#if defined(GL_NV_shader_thread_group)
    return gl_ThreadInWarpNV == firstLive;
#elif defined(GL_ARB_shader_ballot)
    return gl_SubGroupInvocationARB == firstLive;
#else
    return gl_SubgroupInvocationID == firstLive;
#endif
}
#endif
|
||||
|
||||
// Fallback subgroupBallotBitCount when the KHR ballot extension is unavailable:
// counts the set bits of a ballot value.
#if !defined(GL_KHR_shader_subgroup_ballot)
uint subgroupBallotBitCount(uvec4 value)
{
#ifdef GL_NV_shader_thread_group
    // Only the first word is counted on the NV path.
    return uint(bitCount(value.x));
#else
    // Otherwise the first two words are counted.
    ivec2 bits = bitCount(value.xy);
    return uint(bits.x + bits.y);
#endif
}
#endif
|
||||
|
||||
// Fallback subgroupBallotBitExtract when the KHR ballot extension is
// unavailable: reports whether bit `index` of the ballot `value` is set.
#if !defined(GL_KHR_shader_subgroup_ballot)
bool subgroupBallotBitExtract(uvec4 value, uint index)
{
#ifdef GL_NV_shader_thread_group
    // NV path: the bit is read from the first word only.
    uint word = value.x;
    uint bit = index;
#else
    // Select the 32-bit word (index / 32) and the bit within it (index % 32).
    uint word = value[index >> 5u];
    uint bit = index & 0x1fu;
#endif
    return ((word >> bit) & 1u) == 1u;
}
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulates subgroupMul(uint) with NV shuffles: multiplies v across the active invocations.
uint subgroupMul(uint v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        uint product = 1u;
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uint lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    uint product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uint partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(uvec2) with NV shuffles: multiplies v component-wise across the active invocations.
uvec2 subgroupMul(uvec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        uvec2 product = uvec2(1u);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    uvec2 product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec2 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(uvec3) with NV shuffles: multiplies v component-wise across the active invocations.
uvec3 subgroupMul(uvec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        uvec3 product = uvec3(1u);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    uvec3 product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec3 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(uvec4) with NV shuffles: multiplies v component-wise across the active invocations.
uvec4 subgroupMul(uvec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        uvec4 product = uvec4(1u);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    uvec4 product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec4 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(int) with NV shuffles: multiplies v across the active invocations.
int subgroupMul(int v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        int product = 1;
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            int lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    int product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        int partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(ivec2) with NV shuffles: multiplies v component-wise across the active invocations.
ivec2 subgroupMul(ivec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        ivec2 product = ivec2(1);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    ivec2 product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec2 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(ivec3) with NV shuffles: multiplies v component-wise across the active invocations.
ivec3 subgroupMul(ivec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        ivec3 product = ivec3(1);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    ivec3 product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec3 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
// Emulates subgroupMul(ivec4) with NV shuffles: multiplies v component-wise across the active invocations.
ivec4 subgroupMul(ivec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: read each lane and fold in only the live ones.
        ivec4 product = ivec4(1);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live)
            {
                product *= lane_value;
            }
        }
        return product;
    }

    // Every invocation is active: XOR exchanges with strides 1, 2, 4, ... up to half the subgroup.
    ivec4 product = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec4 partner = shuffleXorNV(product, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            product *= partner;
        }
    }
    return product;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulates subgroupExclusiveMul(uint) with NV shuffles: exclusive multiplicative scan of v.
uint subgroupExclusiveMul(uint v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        uint scan = 1u;
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uint lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uint scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uint lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = 1u;
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(uvec2) with NV shuffles: component-wise exclusive multiplicative scan of v.
uvec2 subgroupExclusiveMul(uvec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        uvec2 scan = uvec2(1u);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uvec2 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec2 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = uvec2(1u);
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(uvec3) with NV shuffles: component-wise exclusive multiplicative scan of v.
uvec3 subgroupExclusiveMul(uvec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        uvec3 scan = uvec3(1u);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uvec3 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec3 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = uvec3(1u);
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(uvec4) with NV shuffles: component-wise exclusive multiplicative scan of v.
uvec4 subgroupExclusiveMul(uvec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        uvec4 scan = uvec4(1u);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uvec4 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec4 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = uvec4(1u);
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(int) with NV shuffles: exclusive multiplicative scan of v.
int subgroupExclusiveMul(int v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        int scan = 1;
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            int lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    int scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        int lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = 1;
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(ivec2) with NV shuffles: component-wise exclusive multiplicative scan of v.
ivec2 subgroupExclusiveMul(ivec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        ivec2 scan = ivec2(1);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    ivec2 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec2 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = ivec2(1);
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(ivec3) with NV shuffles: component-wise exclusive multiplicative scan of v.
ivec3 subgroupExclusiveMul(ivec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        ivec3 scan = ivec3(1);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    ivec3 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec3 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = ivec3(1);
    }
    return scan;
}
|
||||
// Emulates subgroupExclusiveMul(ivec4) with NV shuffles: component-wise exclusive multiplicative scan of v.
ivec4 subgroupExclusiveMul(ivec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLtMask.
        ivec4 scan = ivec4(1);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    ivec4 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec4 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }

    // Shift the scan up by one lane; the elected invocation restarts at the identity.
    scan = shuffleUpNV(scan, 1u, gl_SubgroupSize);
    if (subgroupElect())
    {
        scan = ivec4(1);
    }
    return scan;
}
|
||||
#endif
|
||||
|
||||
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
||||
#elif defined(GL_NV_shader_thread_shuffle)
|
||||
// Emulates subgroupInclusiveMul(uint) with NV shuffles: inclusive multiplicative scan of v.
uint subgroupInclusiveMul(uint v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        uint scan = 1u;
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uint lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uint scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uint lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(uvec2) with NV shuffles: component-wise inclusive multiplicative scan of v.
uvec2 subgroupInclusiveMul(uvec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        uvec2 scan = uvec2(1u);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uvec2 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec2 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(uvec3) with NV shuffles: component-wise inclusive multiplicative scan of v.
uvec3 subgroupInclusiveMul(uvec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        uvec3 scan = uvec3(1u);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uvec3 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec3 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(uvec4) with NV shuffles: component-wise inclusive multiplicative scan of v.
uvec4 subgroupInclusiveMul(uvec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        uvec4 scan = uvec4(1u);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            uvec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    uvec4 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        uvec4 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(int) with NV shuffles: inclusive multiplicative scan of v.
int subgroupInclusiveMul(int v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        int scan = 1;
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            int lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    int scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        int lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(ivec2) with NV shuffles: component-wise inclusive multiplicative scan of v.
ivec2 subgroupInclusiveMul(ivec2 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        ivec2 scan = ivec2(1);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec2 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    ivec2 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec2 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(ivec3) with NV shuffles: component-wise inclusive multiplicative scan of v.
ivec3 subgroupInclusiveMul(ivec3 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        ivec3 scan = ivec3(1);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec3 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    ivec3 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec3 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
// Emulates subgroupInclusiveMul(ivec4) with NV shuffles: component-wise inclusive multiplicative scan of v.
ivec4 subgroupInclusiveMul(ivec4 v)
{
    uvec4 live_mask = subgroupBallot(true);
    if (subgroupBallotBitCount(live_mask) != gl_SubgroupSize)
    {
        // Not every invocation is active: fold in live lanes whose index is
        // below the bit count of gl_SubgroupLeMask.
        ivec4 scan = ivec4(1);
        uint lane_limit = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint lane = 0u; lane < gl_SubgroupSize; ++lane)
        {
            bool lane_live = subgroupBallotBitExtract(live_mask, lane);
            ivec4 lane_value = shuffleNV(v, lane, gl_SubgroupSize);
            if (lane_live && (lane < lane_limit))
            {
                scan *= lane_value;
            }
        }
        return scan;
    }

    // Every invocation is active: shuffle-up scan with strides 1, 2, 4, ... up to half the subgroup.
    ivec4 scan = v;
    uint last_stride = gl_SubgroupSize / 2u;
    for (uint stride = 1u; stride <= last_stride; stride <<= 1u)
    {
        bool in_range;
        ivec4 lower = shuffleUpNV(scan, stride, gl_SubgroupSize, in_range);
        if (in_range)
        {
            scan *= lower;
        }
    }
    return scan;
}
|
||||
#endif
|
||||
|
||||
// Runs subgroupMul, subgroupExclusiveMul and subgroupInclusiveMul on every
// integer input type, indexing the inputs by the local invocation's x coordinate.
void main()
{
    uint idx = gl_LocalInvocationID.x;

    // Reduction.
    _16.data_out_int = subgroupMul(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupMul(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupMul(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupMul(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupMul(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupMul(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupMul(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupMul(_29.data_in_uvec4[idx]);

    // Exclusive scan.
    _16.data_out_int = subgroupExclusiveMul(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupExclusiveMul(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupExclusiveMul(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupExclusiveMul(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupExclusiveMul(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupExclusiveMul(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupExclusiveMul(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupExclusiveMul(_29.data_in_uvec4[idx]);

    // Inclusive scan.
    _16.data_out_int = subgroupInclusiveMul(_29.data_in_int[idx]);
    _16.data_out_ivec2 = subgroupInclusiveMul(_29.data_in_ivec2[idx]);
    _16.data_out_ivec3 = subgroupInclusiveMul(_29.data_in_ivec3[idx]);
    _16.data_out_ivec4 = subgroupInclusiveMul(_29.data_in_ivec4[idx]);
    _16.data_out_uint = subgroupInclusiveMul(_29.data_in_uint[idx]);
    _16.data_out_uvec2 = subgroupInclusiveMul(_29.data_in_uvec2[idx]);
    _16.data_out_uvec3 = subgroupInclusiveMul(_29.data_in_uvec3[idx]);
    _16.data_out_uvec4 = subgroupInclusiveMul(_29.data_in_uvec4[idx]);
}
|
||||
|
@ -0,0 +1,56 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(set = 0, binding = 1, std430) buffer DATA_OUT
|
||||
{
|
||||
int data_out_int;
|
||||
ivec2 data_out_ivec2;
|
||||
ivec3 data_out_ivec3;
|
||||
ivec4 data_out_ivec4;
|
||||
uint data_out_uint;
|
||||
uvec2 data_out_uvec2;
|
||||
uvec3 data_out_uvec3;
|
||||
uvec4 data_out_uvec4;
|
||||
} _16;
|
||||
|
||||
layout(set = 0, binding = 0, std430) buffer DATA_IN
|
||||
{
|
||||
int data_in_int[128];
|
||||
ivec2 data_in_ivec2[128];
|
||||
ivec3 data_in_ivec3[128];
|
||||
ivec4 data_in_ivec4[128];
|
||||
uint data_in_uint[128];
|
||||
uvec2 data_in_uvec2[128];
|
||||
uvec3 data_in_uvec3[128];
|
||||
uvec4 data_in_uvec4[128];
|
||||
} _29;
|
||||
|
||||
void main()
|
||||
{
|
||||
_16.data_out_int = subgroupMul(_29.data_in_int[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec2 = subgroupMul(_29.data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec3 = subgroupMul(_29.data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec4 = subgroupMul(_29.data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uint = subgroupMul(_29.data_in_uint[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec2 = subgroupMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec3 = subgroupMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec4 = subgroupMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
_16.data_out_int = subgroupExclusiveMul(_29.data_in_int[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec2 = subgroupExclusiveMul(_29.data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec3 = subgroupExclusiveMul(_29.data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec4 = subgroupExclusiveMul(_29.data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uint = subgroupExclusiveMul(_29.data_in_uint[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec2 = subgroupExclusiveMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec3 = subgroupExclusiveMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec4 = subgroupExclusiveMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
_16.data_out_int = subgroupInclusiveMul(_29.data_in_int[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec2 = subgroupInclusiveMul(_29.data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec3 = subgroupInclusiveMul(_29.data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
_16.data_out_ivec4 = subgroupInclusiveMul(_29.data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uint = subgroupInclusiveMul(_29.data_in_uint[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec2 = subgroupInclusiveMul(_29.data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec3 = subgroupInclusiveMul(_29.data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
_16.data_out_uvec4 = subgroupInclusiveMul(_29.data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
}
|
||||
|
61
shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp
Normal file
61
shaders-no-opt/comp/subgroups_arithmetic_fadd.vk.comp
Normal file
@ -0,0 +1,61 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x = 128) in;
|
||||
|
||||
layout(std430, binding = 0) buffer DATA_IN
|
||||
{
|
||||
float data_in_float[128];
|
||||
vec2 data_in_vec2[128];
|
||||
vec3 data_in_vec3[128];
|
||||
vec4 data_in_vec4[128];
|
||||
double data_in_double[128];
|
||||
dvec2 data_in_dvec2[128];
|
||||
dvec3 data_in_dvec3[128];
|
||||
dvec4 data_in_dvec4[128];
|
||||
};
|
||||
|
||||
layout(std430, binding = 1) buffer DATA_OUT
|
||||
{
|
||||
float data_out_float;
|
||||
vec2 data_out_vec2;
|
||||
vec3 data_out_vec3;
|
||||
vec4 data_out_vec4;
|
||||
double data_out_double;
|
||||
dvec2 data_out_dvec2;
|
||||
dvec3 data_out_dvec3;
|
||||
dvec4 data_out_dvec4;
|
||||
};
|
||||
|
||||
// Test entry point: applies subgroupAdd, subgroupExclusiveAdd and
// subgroupInclusiveAdd to every floating-point type declared in DATA_IN
// (float/vec2/vec3/vec4 and double/dvec2/dvec3/dvec4).
// Each invocation reads the array element at gl_LocalInvocationID.x.
// All three operation groups store to the same DATA_OUT members, so later
// stores overwrite the earlier ones.
void main()
|
||||
{
|
||||
// subgroupAdd on every type.
data_out_float = subgroupAdd(data_in_float[gl_LocalInvocationID.x]);
|
||||
data_out_vec2 = subgroupAdd(data_in_vec2[gl_LocalInvocationID.x]);
|
||||
data_out_vec3 = subgroupAdd(data_in_vec3[gl_LocalInvocationID.x]);
|
||||
data_out_vec4 = subgroupAdd(data_in_vec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_double = subgroupAdd(data_in_double[gl_LocalInvocationID.x]);
|
||||
data_out_dvec2 = subgroupAdd(data_in_dvec2[gl_LocalInvocationID.x]);
|
||||
data_out_dvec3 = subgroupAdd(data_in_dvec3[gl_LocalInvocationID.x]);
|
||||
data_out_dvec4 = subgroupAdd(data_in_dvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupExclusiveAdd on every type.
data_out_float = subgroupExclusiveAdd(data_in_float[gl_LocalInvocationID.x]);
|
||||
data_out_vec2 = subgroupExclusiveAdd(data_in_vec2[gl_LocalInvocationID.x]);
|
||||
data_out_vec3 = subgroupExclusiveAdd(data_in_vec3[gl_LocalInvocationID.x]);
|
||||
data_out_vec4 = subgroupExclusiveAdd(data_in_vec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_double = subgroupExclusiveAdd(data_in_double[gl_LocalInvocationID.x]);
|
||||
data_out_dvec2 = subgroupExclusiveAdd(data_in_dvec2[gl_LocalInvocationID.x]);
|
||||
data_out_dvec3 = subgroupExclusiveAdd(data_in_dvec3[gl_LocalInvocationID.x]);
|
||||
data_out_dvec4 = subgroupExclusiveAdd(data_in_dvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupInclusiveAdd on every type.
data_out_float = subgroupInclusiveAdd(data_in_float[gl_LocalInvocationID.x]);
|
||||
data_out_vec2 = subgroupInclusiveAdd(data_in_vec2[gl_LocalInvocationID.x]);
|
||||
data_out_vec3 = subgroupInclusiveAdd(data_in_vec3[gl_LocalInvocationID.x]);
|
||||
data_out_vec4 = subgroupInclusiveAdd(data_in_vec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_double = subgroupInclusiveAdd(data_in_double[gl_LocalInvocationID.x]);
|
||||
data_out_dvec2 = subgroupInclusiveAdd(data_in_dvec2[gl_LocalInvocationID.x]);
|
||||
data_out_dvec3 = subgroupInclusiveAdd(data_in_dvec3[gl_LocalInvocationID.x]);
|
||||
data_out_dvec4 = subgroupInclusiveAdd(data_in_dvec4[gl_LocalInvocationID.x]);
|
||||
}
|
61
shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp
Normal file
61
shaders-no-opt/comp/subgroups_arithmetic_fmul.vk.comp
Normal file
@ -0,0 +1,61 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x = 128) in;
|
||||
|
||||
layout(std430, binding = 0) buffer DATA_IN
|
||||
{
|
||||
float data_in_float[128];
|
||||
vec2 data_in_vec2[128];
|
||||
vec3 data_in_vec3[128];
|
||||
vec4 data_in_vec4[128];
|
||||
double data_in_double[128];
|
||||
dvec2 data_in_dvec2[128];
|
||||
dvec3 data_in_dvec3[128];
|
||||
dvec4 data_in_dvec4[128];
|
||||
};
|
||||
|
||||
layout(std430, binding = 1) buffer DATA_OUT
|
||||
{
|
||||
float data_out_float;
|
||||
vec2 data_out_vec2;
|
||||
vec3 data_out_vec3;
|
||||
vec4 data_out_vec4;
|
||||
double data_out_double;
|
||||
dvec2 data_out_dvec2;
|
||||
dvec3 data_out_dvec3;
|
||||
dvec4 data_out_dvec4;
|
||||
};
|
||||
|
||||
// Test entry point: applies subgroupMul, subgroupExclusiveMul and
// subgroupInclusiveMul to every floating-point type declared in DATA_IN
// (float/vec2/vec3/vec4 and double/dvec2/dvec3/dvec4).
// Each invocation reads the array element at gl_LocalInvocationID.x.
// All three operation groups store to the same DATA_OUT members, so later
// stores overwrite the earlier ones.
void main()
|
||||
{
|
||||
// subgroupMul on every type.
data_out_float = subgroupMul(data_in_float[gl_LocalInvocationID.x]);
|
||||
data_out_vec2 = subgroupMul(data_in_vec2[gl_LocalInvocationID.x]);
|
||||
data_out_vec3 = subgroupMul(data_in_vec3[gl_LocalInvocationID.x]);
|
||||
data_out_vec4 = subgroupMul(data_in_vec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_double = subgroupMul(data_in_double[gl_LocalInvocationID.x]);
|
||||
data_out_dvec2 = subgroupMul(data_in_dvec2[gl_LocalInvocationID.x]);
|
||||
data_out_dvec3 = subgroupMul(data_in_dvec3[gl_LocalInvocationID.x]);
|
||||
data_out_dvec4 = subgroupMul(data_in_dvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupExclusiveMul on every type.
data_out_float = subgroupExclusiveMul(data_in_float[gl_LocalInvocationID.x]);
|
||||
data_out_vec2 = subgroupExclusiveMul(data_in_vec2[gl_LocalInvocationID.x]);
|
||||
data_out_vec3 = subgroupExclusiveMul(data_in_vec3[gl_LocalInvocationID.x]);
|
||||
data_out_vec4 = subgroupExclusiveMul(data_in_vec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_double = subgroupExclusiveMul(data_in_double[gl_LocalInvocationID.x]);
|
||||
data_out_dvec2 = subgroupExclusiveMul(data_in_dvec2[gl_LocalInvocationID.x]);
|
||||
data_out_dvec3 = subgroupExclusiveMul(data_in_dvec3[gl_LocalInvocationID.x]);
|
||||
data_out_dvec4 = subgroupExclusiveMul(data_in_dvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupInclusiveMul on every type.
data_out_float = subgroupInclusiveMul(data_in_float[gl_LocalInvocationID.x]);
|
||||
data_out_vec2 = subgroupInclusiveMul(data_in_vec2[gl_LocalInvocationID.x]);
|
||||
data_out_vec3 = subgroupInclusiveMul(data_in_vec3[gl_LocalInvocationID.x]);
|
||||
data_out_vec4 = subgroupInclusiveMul(data_in_vec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_double = subgroupInclusiveMul(data_in_double[gl_LocalInvocationID.x]);
|
||||
data_out_dvec2 = subgroupInclusiveMul(data_in_dvec2[gl_LocalInvocationID.x]);
|
||||
data_out_dvec3 = subgroupInclusiveMul(data_in_dvec3[gl_LocalInvocationID.x]);
|
||||
data_out_dvec4 = subgroupInclusiveMul(data_in_dvec4[gl_LocalInvocationID.x]);
|
||||
}
|
61
shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp
Normal file
61
shaders-no-opt/comp/subgroups_arithmetic_iadd.vk.comp
Normal file
@ -0,0 +1,61 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x = 128) in;
|
||||
|
||||
layout(std430, binding = 0) buffer DATA_IN
|
||||
{
|
||||
int data_in_int[128];
|
||||
ivec2 data_in_ivec2[128];
|
||||
ivec3 data_in_ivec3[128];
|
||||
ivec4 data_in_ivec4[128];
|
||||
uint data_in_uint[128];
|
||||
uvec2 data_in_uvec2[128];
|
||||
uvec3 data_in_uvec3[128];
|
||||
uvec4 data_in_uvec4[128];
|
||||
};
|
||||
|
||||
layout(std430, binding = 1) buffer DATA_OUT
|
||||
{
|
||||
int data_out_int;
|
||||
ivec2 data_out_ivec2;
|
||||
ivec3 data_out_ivec3;
|
||||
ivec4 data_out_ivec4;
|
||||
uint data_out_uint;
|
||||
uvec2 data_out_uvec2;
|
||||
uvec3 data_out_uvec3;
|
||||
uvec4 data_out_uvec4;
|
||||
};
|
||||
|
||||
// Test entry point: applies subgroupAdd, subgroupExclusiveAdd and
// subgroupInclusiveAdd to every integer type declared in DATA_IN
// (int/ivec2/ivec3/ivec4 and uint/uvec2/uvec3/uvec4).
// Each invocation reads the array element at gl_LocalInvocationID.x.
// All three operation groups store to the same DATA_OUT members, so later
// stores overwrite the earlier ones.
void main()
|
||||
{
|
||||
// subgroupAdd on every type.
data_out_int = subgroupAdd(data_in_int[gl_LocalInvocationID.x]);
|
||||
data_out_ivec2 = subgroupAdd(data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
data_out_ivec3 = subgroupAdd(data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
data_out_ivec4 = subgroupAdd(data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_uint = subgroupAdd(data_in_uint[gl_LocalInvocationID.x]);
|
||||
data_out_uvec2 = subgroupAdd(data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
data_out_uvec3 = subgroupAdd(data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
data_out_uvec4 = subgroupAdd(data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupExclusiveAdd on every type.
data_out_int = subgroupExclusiveAdd(data_in_int[gl_LocalInvocationID.x]);
|
||||
data_out_ivec2 = subgroupExclusiveAdd(data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
data_out_ivec3 = subgroupExclusiveAdd(data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
data_out_ivec4 = subgroupExclusiveAdd(data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_uint = subgroupExclusiveAdd(data_in_uint[gl_LocalInvocationID.x]);
|
||||
data_out_uvec2 = subgroupExclusiveAdd(data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
data_out_uvec3 = subgroupExclusiveAdd(data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
data_out_uvec4 = subgroupExclusiveAdd(data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupInclusiveAdd on every type.
data_out_int = subgroupInclusiveAdd(data_in_int[gl_LocalInvocationID.x]);
|
||||
data_out_ivec2 = subgroupInclusiveAdd(data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
data_out_ivec3 = subgroupInclusiveAdd(data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
data_out_ivec4 = subgroupInclusiveAdd(data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_uint = subgroupInclusiveAdd(data_in_uint[gl_LocalInvocationID.x]);
|
||||
data_out_uvec2 = subgroupInclusiveAdd(data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
data_out_uvec3 = subgroupInclusiveAdd(data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
data_out_uvec4 = subgroupInclusiveAdd(data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
}
|
61
shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp
Normal file
61
shaders-no-opt/comp/subgroups_arithmetic_imul.vk.comp
Normal file
@ -0,0 +1,61 @@
|
||||
#version 450
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x = 128) in;
|
||||
|
||||
layout(std430, binding = 0) buffer DATA_IN
|
||||
{
|
||||
int data_in_int[128];
|
||||
ivec2 data_in_ivec2[128];
|
||||
ivec3 data_in_ivec3[128];
|
||||
ivec4 data_in_ivec4[128];
|
||||
uint data_in_uint[128];
|
||||
uvec2 data_in_uvec2[128];
|
||||
uvec3 data_in_uvec3[128];
|
||||
uvec4 data_in_uvec4[128];
|
||||
};
|
||||
|
||||
layout(std430, binding = 1) buffer DATA_OUT
|
||||
{
|
||||
int data_out_int;
|
||||
ivec2 data_out_ivec2;
|
||||
ivec3 data_out_ivec3;
|
||||
ivec4 data_out_ivec4;
|
||||
uint data_out_uint;
|
||||
uvec2 data_out_uvec2;
|
||||
uvec3 data_out_uvec3;
|
||||
uvec4 data_out_uvec4;
|
||||
};
|
||||
|
||||
// Test entry point: applies subgroupMul, subgroupExclusiveMul and
// subgroupInclusiveMul to every integer type declared in DATA_IN
// (int/ivec2/ivec3/ivec4 and uint/uvec2/uvec3/uvec4).
// Each invocation reads the array element at gl_LocalInvocationID.x.
// All three operation groups store to the same DATA_OUT members, so later
// stores overwrite the earlier ones.
void main()
|
||||
{
|
||||
// subgroupMul on every type.
data_out_int = subgroupMul(data_in_int[gl_LocalInvocationID.x]);
|
||||
data_out_ivec2 = subgroupMul(data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
data_out_ivec3 = subgroupMul(data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
data_out_ivec4 = subgroupMul(data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_uint = subgroupMul(data_in_uint[gl_LocalInvocationID.x]);
|
||||
data_out_uvec2 = subgroupMul(data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
data_out_uvec3 = subgroupMul(data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
data_out_uvec4 = subgroupMul(data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupExclusiveMul on every type.
data_out_int = subgroupExclusiveMul(data_in_int[gl_LocalInvocationID.x]);
|
||||
data_out_ivec2 = subgroupExclusiveMul(data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
data_out_ivec3 = subgroupExclusiveMul(data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
data_out_ivec4 = subgroupExclusiveMul(data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_uint = subgroupExclusiveMul(data_in_uint[gl_LocalInvocationID.x]);
|
||||
data_out_uvec2 = subgroupExclusiveMul(data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
data_out_uvec3 = subgroupExclusiveMul(data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
data_out_uvec4 = subgroupExclusiveMul(data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
|
||||
// subgroupInclusiveMul on every type.
data_out_int = subgroupInclusiveMul(data_in_int[gl_LocalInvocationID.x]);
|
||||
data_out_ivec2 = subgroupInclusiveMul(data_in_ivec2[gl_LocalInvocationID.x]);
|
||||
data_out_ivec3 = subgroupInclusiveMul(data_in_ivec3[gl_LocalInvocationID.x]);
|
||||
data_out_ivec4 = subgroupInclusiveMul(data_in_ivec4[gl_LocalInvocationID.x]);
|
||||
|
||||
data_out_uint = subgroupInclusiveMul(data_in_uint[gl_LocalInvocationID.x]);
|
||||
data_out_uvec2 = subgroupInclusiveMul(data_in_uvec2[gl_LocalInvocationID.x]);
|
||||
data_out_uvec3 = subgroupInclusiveMul(data_in_uvec3[gl_LocalInvocationID.x]);
|
||||
data_out_uvec4 = subgroupInclusiveMul(data_in_uvec4[gl_LocalInvocationID.x]);
|
||||
}
|
315
spirv_glsl.cpp
315
spirv_glsl.cpp
@ -30,6 +30,7 @@
|
||||
#include <limits>
|
||||
#include <locale.h>
|
||||
#include <utility>
|
||||
#include <array>
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <langinfo.h>
|
||||
@ -3993,6 +3994,169 @@ void CompilerGLSL::emit_output_variable_initializer(const SPIRVariable &var)
|
||||
}
|
||||
}
|
||||
|
||||
void CompilerGLSL::emit_subgroup_arithmetic_workaround(const std::string &func, Op op, GroupOperation group_op)
|
||||
{
|
||||
std::string result;
|
||||
switch (group_op)
|
||||
{
|
||||
case GroupOperationReduce:
|
||||
result = "reduction";
|
||||
break;
|
||||
|
||||
case GroupOperationExclusiveScan:
|
||||
result = "excl_scan";
|
||||
break;
|
||||
|
||||
case GroupOperationInclusiveScan:
|
||||
result = "incl_scan";
|
||||
break;
|
||||
|
||||
default:
|
||||
SPIRV_CROSS_THROW("Unsupported workaround for arithmetic group operation");
|
||||
}
|
||||
|
||||
struct TypeInfo
|
||||
{
|
||||
std::string type;
|
||||
std::string identity;
|
||||
};
|
||||
|
||||
std::vector<TypeInfo> type_infos;
|
||||
switch (op)
|
||||
{
|
||||
case OpGroupNonUniformIAdd:
|
||||
{
|
||||
type_infos.emplace_back(TypeInfo{ "uint", "0u" });
|
||||
type_infos.emplace_back(TypeInfo{ "uvec2", "uvec2(0u)" });
|
||||
type_infos.emplace_back(TypeInfo{ "uvec3", "uvec3(0u)" });
|
||||
type_infos.emplace_back(TypeInfo{ "uvec4", "uvec4(0u)" });
|
||||
type_infos.emplace_back(TypeInfo{ "int", "0" });
|
||||
type_infos.emplace_back(TypeInfo{ "ivec2", "ivec2(0)" });
|
||||
type_infos.emplace_back(TypeInfo{ "ivec3", "ivec3(0)" });
|
||||
type_infos.emplace_back(TypeInfo{ "ivec4", "ivec4(0)" });
|
||||
break;
|
||||
}
|
||||
|
||||
case OpGroupNonUniformFAdd:
|
||||
{
|
||||
type_infos.emplace_back(TypeInfo{ "float", "0.0f" });
|
||||
type_infos.emplace_back(TypeInfo{ "vec2", "vec2(0.0f)" });
|
||||
type_infos.emplace_back(TypeInfo{ "vec3", "vec3(0.0f)" });
|
||||
type_infos.emplace_back(TypeInfo{ "vec4", "vec4(0.0f)" });
|
||||
// ARB_gpu_shader_fp64 is required in GL4.0 which in turn is required by NV_thread_shuffle
|
||||
type_infos.emplace_back(TypeInfo{ "double", "0.0LF" });
|
||||
type_infos.emplace_back(TypeInfo{ "dvec2", "dvec2(0.0LF)" });
|
||||
type_infos.emplace_back(TypeInfo{ "dvec3", "dvec3(0.0LF)" });
|
||||
type_infos.emplace_back(TypeInfo{ "dvec4", "dvec4(0.0LF)" });
|
||||
break;
|
||||
}
|
||||
|
||||
case OpGroupNonUniformIMul:
|
||||
{
|
||||
type_infos.emplace_back(TypeInfo{ "uint", "1u" });
|
||||
type_infos.emplace_back(TypeInfo{ "uvec2", "uvec2(1u)" });
|
||||
type_infos.emplace_back(TypeInfo{ "uvec3", "uvec3(1u)" });
|
||||
type_infos.emplace_back(TypeInfo{ "uvec4", "uvec4(1u)" });
|
||||
type_infos.emplace_back(TypeInfo{ "int", "1" });
|
||||
type_infos.emplace_back(TypeInfo{ "ivec2", "ivec2(1)" });
|
||||
type_infos.emplace_back(TypeInfo{ "ivec3", "ivec3(1)" });
|
||||
type_infos.emplace_back(TypeInfo{ "ivec4", "ivec4(1)" });
|
||||
break;
|
||||
}
|
||||
|
||||
case OpGroupNonUniformFMul:
|
||||
{
|
||||
type_infos.emplace_back(TypeInfo{ "float", "1.0f" });
|
||||
type_infos.emplace_back(TypeInfo{ "vec2", "vec2(1.0f)" });
|
||||
type_infos.emplace_back(TypeInfo{ "vec3", "vec3(1.0f)" });
|
||||
type_infos.emplace_back(TypeInfo{ "vec4", "vec4(1.0f)" });
|
||||
type_infos.emplace_back(TypeInfo{ "double", "0.0LF" });
|
||||
type_infos.emplace_back(TypeInfo{ "dvec2", "dvec2(1.0LF)" });
|
||||
type_infos.emplace_back(TypeInfo{ "dvec3", "dvec3(1.0LF)" });
|
||||
type_infos.emplace_back(TypeInfo{ "dvec4", "dvec4(1.0LF)" });
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
SPIRV_CROSS_THROW("Unsupported workaround for arithmetic group operation");
|
||||
}
|
||||
|
||||
const bool op_is_addition = op == OpGroupNonUniformIAdd || op == OpGroupNonUniformFAdd;
|
||||
const bool op_is_multiplication = op == OpGroupNonUniformIMul || op == OpGroupNonUniformFMul;
|
||||
std::string op_symbol;
|
||||
if (op_is_addition)
|
||||
{
|
||||
op_symbol = "+=";
|
||||
}
|
||||
else if (op_is_multiplication)
|
||||
{
|
||||
op_symbol = "*=";
|
||||
}
|
||||
|
||||
for (const TypeInfo &t : type_infos)
|
||||
{
|
||||
statement(t.type, " ", func, "(", t.type, " v)");
|
||||
begin_scope();
|
||||
statement(t.type, " ", result, " = ", t.identity, ";");
|
||||
statement("uvec4 active_threads = subgroupBallot(true);");
|
||||
statement("if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)");
|
||||
begin_scope();
|
||||
statement("uint total = gl_SubgroupSize / 2u;");
|
||||
statement(result, " = v;");
|
||||
statement("for (uint i = 1u; i <= total; i <<= 1u)");
|
||||
begin_scope();
|
||||
statement("bool valid;");
|
||||
if (group_op == GroupOperationReduce)
|
||||
{
|
||||
statement(t.type, " s = shuffleXorNV(", result, ", i, gl_SubgroupSize, valid);");
|
||||
}
|
||||
else if (group_op == GroupOperationExclusiveScan || group_op == GroupOperationInclusiveScan)
|
||||
{
|
||||
statement(t.type, " s = shuffleUpNV(", result, ", i, gl_SubgroupSize, valid);");
|
||||
}
|
||||
if (op_is_addition || op_is_multiplication)
|
||||
{
|
||||
statement(result, " ", op_symbol, " valid ? s : ", t.identity, ";");
|
||||
}
|
||||
end_scope();
|
||||
if (group_op == GroupOperationExclusiveScan)
|
||||
{
|
||||
statement(result, " = shuffleUpNV(", result, ", 1u, gl_SubgroupSize);");
|
||||
statement("if (subgroupElect())");
|
||||
begin_scope();
|
||||
statement(result, " = ", t.identity, ";");
|
||||
end_scope();
|
||||
}
|
||||
end_scope();
|
||||
statement("else");
|
||||
begin_scope();
|
||||
if (group_op == GroupOperationExclusiveScan)
|
||||
{
|
||||
statement("uint total = subgroupBallotBitCount(gl_SubgroupLtMask);");
|
||||
}
|
||||
else if (group_op == GroupOperationInclusiveScan)
|
||||
{
|
||||
statement("uint total = subgroupBallotBitCount(gl_SubgroupLeMask);");
|
||||
}
|
||||
statement("for (uint i = 0u; i < gl_SubgroupSize; ++i)");
|
||||
begin_scope();
|
||||
statement("bool valid = subgroupBallotBitExtract(active_threads, i);");
|
||||
statement(t.type, " s = shuffleNV(v, i, gl_SubgroupSize);");
|
||||
if (group_op == GroupOperationExclusiveScan || group_op == GroupOperationInclusiveScan)
|
||||
{
|
||||
statement("valid = valid && (i < total);");
|
||||
}
|
||||
if (op_is_addition || op_is_multiplication)
|
||||
{
|
||||
statement(result, " ", op_symbol, " valid ? s : ", t.identity, ";");
|
||||
}
|
||||
end_scope();
|
||||
end_scope();
|
||||
statement("return ", result, ";");
|
||||
end_scope();
|
||||
}
|
||||
}
|
||||
|
||||
void CompilerGLSL::emit_extension_workarounds(spv::ExecutionModel model)
|
||||
{
|
||||
static const char *workaround_types[] = { "int", "ivec2", "ivec3", "ivec4", "uint", "uvec2", "uvec3", "uvec4",
|
||||
@ -4396,6 +4560,57 @@ void CompilerGLSL::emit_extension_workarounds(spv::ExecutionModel model)
|
||||
statement("#endif");
|
||||
statement("");
|
||||
}
|
||||
|
||||
auto arithmetic_feature_helper =
|
||||
[&](Supp::Feature feat, std::string func_name, spv::Op op, spv::GroupOperation group_op)
|
||||
{
|
||||
if (shader_subgroup_supporter.is_feature_requested(feat))
|
||||
{
|
||||
auto exts = Supp::get_candidates_for_feature(feat, result);
|
||||
for (auto &e : exts)
|
||||
{
|
||||
const char *name = Supp::get_extension_name(e);
|
||||
statement(&e == &exts.front() ? "#if" : "#elif", " defined(", name, ")");
|
||||
|
||||
switch (e)
|
||||
{
|
||||
case Supp::NV_shader_thread_shuffle:
|
||||
emit_subgroup_arithmetic_workaround(func_name, op, group_op);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
statement("#endif");
|
||||
statement("");
|
||||
}
|
||||
};
|
||||
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticIAddReduce, "subgroupAdd", OpGroupNonUniformIAdd,
|
||||
GroupOperationReduce);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticIAddExclusiveScan, "subgroupExclusiveAdd",
|
||||
OpGroupNonUniformIAdd, GroupOperationExclusiveScan);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticIAddInclusiveScan, "subgroupInclusiveAdd",
|
||||
OpGroupNonUniformIAdd, GroupOperationInclusiveScan);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticFAddReduce, "subgroupAdd", OpGroupNonUniformFAdd,
|
||||
GroupOperationReduce);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticFAddExclusiveScan, "subgroupExclusiveAdd",
|
||||
OpGroupNonUniformFAdd, GroupOperationExclusiveScan);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticFAddInclusiveScan, "subgroupInclusiveAdd",
|
||||
OpGroupNonUniformFAdd, GroupOperationInclusiveScan);
|
||||
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticIMulReduce, "subgroupMul", OpGroupNonUniformIMul,
|
||||
GroupOperationReduce);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticIMulExclusiveScan, "subgroupExclusiveMul",
|
||||
OpGroupNonUniformIMul, GroupOperationExclusiveScan);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticIMulInclusiveScan, "subgroupInclusiveMul",
|
||||
OpGroupNonUniformIMul, GroupOperationInclusiveScan);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticFMulReduce, "subgroupMul", OpGroupNonUniformFMul,
|
||||
GroupOperationReduce);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticFMulExclusiveScan, "subgroupExclusiveMul",
|
||||
OpGroupNonUniformFMul, GroupOperationExclusiveScan);
|
||||
arithmetic_feature_helper(Supp::SubgroupArithmeticFMulInclusiveScan, "subgroupInclusiveMul",
|
||||
OpGroupNonUniformFMul, GroupOperationInclusiveScan);
|
||||
}
|
||||
|
||||
if (!workaround_ubo_load_overload_types.empty())
|
||||
@ -7109,7 +7324,7 @@ string CompilerGLSL::to_combined_image_sampler(VariableID image_id, VariableID s
|
||||
}
|
||||
}
|
||||
|
||||
bool CompilerGLSL::is_supported_subgroup_op_in_opengl(spv::Op op)
|
||||
bool CompilerGLSL::is_supported_subgroup_op_in_opengl(spv::Op op, const uint32_t *ops)
|
||||
{
|
||||
switch (op)
|
||||
{
|
||||
@ -7128,6 +7343,22 @@ bool CompilerGLSL::is_supported_subgroup_op_in_opengl(spv::Op op)
|
||||
case OpGroupNonUniformBallotBitExtract:
|
||||
case OpGroupNonUniformInverseBallot:
|
||||
return true;
|
||||
case OpGroupNonUniformIAdd:
|
||||
case OpGroupNonUniformFAdd:
|
||||
case OpGroupNonUniformIMul:
|
||||
case OpGroupNonUniformFMul:
|
||||
{
|
||||
const GroupOperation operation = static_cast<GroupOperation>(ops[3]);
|
||||
if (operation == GroupOperationReduce || operation == GroupOperationInclusiveScan ||
|
||||
operation == GroupOperationExclusiveScan)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@ -8725,7 +8956,7 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i)
|
||||
const uint32_t *ops = stream(i);
|
||||
auto op = static_cast<Op>(i.op);
|
||||
|
||||
if (!options.vulkan_semantics && !is_supported_subgroup_op_in_opengl(op))
|
||||
if (!options.vulkan_semantics && !is_supported_subgroup_op_in_opengl(op, ops))
|
||||
SPIRV_CROSS_THROW("This subgroup operation is only supported in Vulkan semantics.");
|
||||
|
||||
// If we need to do implicit bitcasts, make sure we do it with the correct type.
|
||||
@ -8793,12 +9024,34 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i)
|
||||
}
|
||||
break;
|
||||
|
||||
case OpGroupNonUniformFAdd:
|
||||
case OpGroupNonUniformFMul:
|
||||
// clang-format off
|
||||
#define GLSL_GROUP_OP(OP)\
|
||||
case OpGroupNonUniform##OP:\
|
||||
{\
|
||||
auto operation = static_cast<GroupOperation>(ops[3]);\
|
||||
if (operation == GroupOperationClusteredReduce)\
|
||||
require_extension_internal("GL_KHR_shader_subgroup_clustered");\
|
||||
else if (operation == GroupOperationReduce)\
|
||||
request_subgroup_feature(ShaderSubgroupSupportHelper::SubgroupArithmetic##OP##Reduce);\
|
||||
else if (operation == GroupOperationExclusiveScan)\
|
||||
request_subgroup_feature(ShaderSubgroupSupportHelper::SubgroupArithmetic##OP##ExclusiveScan);\
|
||||
else if (operation == GroupOperationInclusiveScan)\
|
||||
request_subgroup_feature(ShaderSubgroupSupportHelper::SubgroupArithmetic##OP##InclusiveScan);\
|
||||
else\
|
||||
SPIRV_CROSS_THROW("Invalid group operation.");\
|
||||
break;\
|
||||
}
|
||||
|
||||
GLSL_GROUP_OP(IAdd)
|
||||
GLSL_GROUP_OP(FAdd)
|
||||
GLSL_GROUP_OP(IMul)
|
||||
GLSL_GROUP_OP(FMul)
|
||||
|
||||
#undef GLSL_GROUP_OP
|
||||
// clang-format on
|
||||
|
||||
case OpGroupNonUniformFMin:
|
||||
case OpGroupNonUniformFMax:
|
||||
case OpGroupNonUniformIAdd:
|
||||
case OpGroupNonUniformIMul:
|
||||
case OpGroupNonUniformSMin:
|
||||
case OpGroupNonUniformSMax:
|
||||
case OpGroupNonUniformUMin:
|
||||
@ -17667,6 +17920,7 @@ const char *CompilerGLSL::ShaderSubgroupSupportHelper::get_extension_name(Candid
|
||||
static const char *const retval[CandidateCount] = { "GL_KHR_shader_subgroup_ballot",
|
||||
"GL_KHR_shader_subgroup_basic",
|
||||
"GL_KHR_shader_subgroup_vote",
|
||||
"GL_KHR_shader_subgroup_arithmetic",
|
||||
"GL_NV_gpu_shader_5",
|
||||
"GL_NV_shader_thread_group",
|
||||
"GL_NV_shader_thread_shuffle",
|
||||
@ -17715,6 +17969,21 @@ CompilerGLSL::ShaderSubgroupSupportHelper::FeatureVector CompilerGLSL::ShaderSub
|
||||
return { SubgroupMask };
|
||||
case SubgroupBallotBitCount:
|
||||
return { SubgroupBallot };
|
||||
case SubgroupArithmeticIAddReduce:
|
||||
case SubgroupArithmeticIAddInclusiveScan:
|
||||
case SubgroupArithmeticFAddReduce:
|
||||
case SubgroupArithmeticFAddInclusiveScan:
|
||||
case SubgroupArithmeticIMulReduce:
|
||||
case SubgroupArithmeticIMulInclusiveScan:
|
||||
case SubgroupArithmeticFMulReduce:
|
||||
case SubgroupArithmeticFMulInclusiveScan:
|
||||
return { SubgroupSize, SubgroupBallot, SubgroupBallotBitCount, SubgroupMask, SubgroupBallotBitExtract };
|
||||
case SubgroupArithmeticIAddExclusiveScan:
|
||||
case SubgroupArithmeticFAddExclusiveScan:
|
||||
case SubgroupArithmeticIMulExclusiveScan:
|
||||
case SubgroupArithmeticFMulExclusiveScan:
|
||||
return { SubgroupSize, SubgroupBallot, SubgroupBallotBitCount,
|
||||
SubgroupMask, SubgroupElect, SubgroupBallotBitExtract };
|
||||
default:
|
||||
return {};
|
||||
}
|
||||
@ -17728,11 +17997,15 @@ CompilerGLSL::ShaderSubgroupSupportHelper::FeatureMask CompilerGLSL::ShaderSubgr
|
||||
|
||||
bool CompilerGLSL::ShaderSubgroupSupportHelper::can_feature_be_implemented_without_extensions(Feature feature)
|
||||
{
|
||||
static const bool retval[FeatureCount] = { false, false, false, false, false, false,
|
||||
true, // SubgroupBalloFindLSB_MSB
|
||||
false, false, false, false,
|
||||
true, // SubgroupMemBarrier - replaced with workgroup memory barriers
|
||||
false, false, true, false };
|
||||
static const bool retval[FeatureCount] = {
|
||||
false, false, false, false, false, false,
|
||||
true, // SubgroupBalloFindLSB_MSB
|
||||
false, false, false, false,
|
||||
true, // SubgroupMemBarrier - replaced with workgroup memory barriers
|
||||
false, false, true, false,
|
||||
false, false, false, false, false, false, // iadd, fadd
|
||||
false, false, false, false, false, false, // imul , fmul
|
||||
};
|
||||
|
||||
return retval[feature];
|
||||
}
|
||||
@ -17744,7 +18017,11 @@ CompilerGLSL::ShaderSubgroupSupportHelper::Candidate CompilerGLSL::ShaderSubgrou
|
||||
KHR_shader_subgroup_ballot, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic,
|
||||
KHR_shader_subgroup_basic, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_vote,
|
||||
KHR_shader_subgroup_vote, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic, KHR_shader_subgroup_basic,
|
||||
KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot
|
||||
KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot,
|
||||
KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
|
||||
KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
|
||||
KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
|
||||
KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
|
||||
};
|
||||
|
||||
return extensions[feature];
|
||||
@ -17840,6 +18117,19 @@ CompilerGLSL::ShaderSubgroupSupportHelper::CandidateVector CompilerGLSL::ShaderS
|
||||
return { NV_shader_thread_group };
|
||||
case SubgroupBallotBitCount:
|
||||
return {};
|
||||
case SubgroupArithmeticIAddReduce:
|
||||
case SubgroupArithmeticIAddExclusiveScan:
|
||||
case SubgroupArithmeticIAddInclusiveScan:
|
||||
case SubgroupArithmeticFAddReduce:
|
||||
case SubgroupArithmeticFAddExclusiveScan:
|
||||
case SubgroupArithmeticFAddInclusiveScan:
|
||||
case SubgroupArithmeticIMulReduce:
|
||||
case SubgroupArithmeticIMulExclusiveScan:
|
||||
case SubgroupArithmeticIMulInclusiveScan:
|
||||
case SubgroupArithmeticFMulReduce:
|
||||
case SubgroupArithmeticFMulExclusiveScan:
|
||||
case SubgroupArithmeticFMulInclusiveScan:
|
||||
return { KHR_shader_subgroup_arithmetic, NV_shader_thread_shuffle };
|
||||
default:
|
||||
return {};
|
||||
}
|
||||
@ -17864,6 +18154,7 @@ CompilerGLSL::ShaderSubgroupSupportHelper::Result::Result()
|
||||
weights[KHR_shader_subgroup_ballot] = big_num;
|
||||
weights[KHR_shader_subgroup_basic] = big_num;
|
||||
weights[KHR_shader_subgroup_vote] = big_num;
|
||||
weights[KHR_shader_subgroup_arithmetic] = big_num;
|
||||
}
|
||||
|
||||
void CompilerGLSL::request_workaround_wrapper_overload(TypeID id)
|
||||
|
@ -292,6 +292,7 @@ protected:
|
||||
KHR_shader_subgroup_ballot,
|
||||
KHR_shader_subgroup_basic,
|
||||
KHR_shader_subgroup_vote,
|
||||
KHR_shader_subgroup_arithmetic,
|
||||
NV_gpu_shader_5,
|
||||
NV_shader_thread_group,
|
||||
NV_shader_thread_shuffle,
|
||||
@ -324,7 +325,18 @@ protected:
|
||||
SubgroupInverseBallot_InclBitCount_ExclBitCout = 13,
|
||||
SubgroupBallotBitExtract = 14,
|
||||
SubgroupBallotBitCount = 15,
|
||||
|
||||
SubgroupArithmeticIAddReduce = 16,
|
||||
SubgroupArithmeticIAddExclusiveScan = 17,
|
||||
SubgroupArithmeticIAddInclusiveScan = 18,
|
||||
SubgroupArithmeticFAddReduce = 19,
|
||||
SubgroupArithmeticFAddExclusiveScan = 20,
|
||||
SubgroupArithmeticFAddInclusiveScan = 21,
|
||||
SubgroupArithmeticIMulReduce = 22,
|
||||
SubgroupArithmeticIMulExclusiveScan = 23,
|
||||
SubgroupArithmeticIMulInclusiveScan = 24,
|
||||
SubgroupArithmeticFMulReduce = 25,
|
||||
SubgroupArithmeticFMulExclusiveScan = 26,
|
||||
SubgroupArithmeticFMulInclusiveScan = 27,
|
||||
FeatureCount
|
||||
};
|
||||
|
||||
@ -358,7 +370,7 @@ protected:
|
||||
};
|
||||
|
||||
// TODO remove this function when all subgroup ops are supported (or make it always return true)
|
||||
static bool is_supported_subgroup_op_in_opengl(spv::Op op);
|
||||
static bool is_supported_subgroup_op_in_opengl(spv::Op op, const uint32_t *ops);
|
||||
|
||||
void reset(uint32_t iteration_count);
|
||||
void emit_function(SPIRFunction &func, const Bitset &return_flags);
|
||||
@ -627,6 +639,7 @@ protected:
|
||||
void emit_struct(SPIRType &type);
|
||||
void emit_resources();
|
||||
void emit_extension_workarounds(spv::ExecutionModel model);
|
||||
// NOTE(review): definition not visible here. Presumably emits the GLSL fallback helper for a single
// subgroup arithmetic operation, with `func` naming the emitted helper and `op` / `group_op`
// selecting the SPIR-V opcode and group operation (reduce / inclusive scan / exclusive scan) —
// confirm against the implementation.
void emit_subgroup_arithmetic_workaround(const std::string &func, spv::Op op, spv::GroupOperation group_op);
|
||||
void emit_polyfills(uint32_t polyfills, bool relaxed);
|
||||
void emit_buffer_block_native(const SPIRVariable &var);
|
||||
void emit_buffer_reference_block(uint32_t type_id, bool forward_declaration);
|
||||
|
Loading…
Reference in New Issue
Block a user