// Web-viewer metadata (not shader source): 893 lines, 28 KiB, plaintext.
#version 450
|
|
|
|
#if defined(GL_KHR_shader_subgroup_ballot)
|
|
#extension GL_KHR_shader_subgroup_ballot : require
|
|
#elif defined(GL_NV_shader_thread_group)
|
|
#extension GL_NV_shader_thread_group : require
|
|
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
|
#extension GL_ARB_shader_int64 : enable
|
|
#extension GL_ARB_shader_ballot : require
|
|
#else
|
|
#error No extensions available to emulate requested subgroup feature.
|
|
#endif
|
|
|
|
#if defined(GL_KHR_shader_subgroup_basic)
|
|
#extension GL_KHR_shader_subgroup_basic : require
|
|
#elif defined(GL_NV_shader_thread_group)
|
|
#extension GL_NV_shader_thread_group : require
|
|
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
|
#extension GL_ARB_shader_int64 : enable
|
|
#extension GL_ARB_shader_ballot : require
|
|
#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5))
|
|
#extension GL_AMD_gpu_shader_int64 : enable
|
|
#extension GL_NV_gpu_shader5 : enable
|
|
#extension GL_AMD_gcn_shader : require
|
|
#else
|
|
#error No extensions available to emulate requested subgroup feature.
|
|
#endif
|
|
|
|
#if defined(GL_KHR_shader_subgroup_ballot)
|
|
#extension GL_KHR_shader_subgroup_ballot : require
|
|
#elif defined(GL_NV_shader_thread_group)
|
|
#extension GL_NV_shader_thread_group : require
|
|
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
|
|
#extension GL_ARB_shader_int64 : enable
|
|
#extension GL_ARB_shader_ballot : require
|
|
#else
|
|
#error No extensions available to emulate requested subgroup feature.
|
|
#endif
|
|
|
|
#if defined(GL_NV_shader_thread_group)
|
|
#extension GL_NV_shader_thread_group : require
|
|
#endif
|
|
|
|
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
|
#elif defined(GL_NV_shader_thread_shuffle)
|
|
#extension GL_NV_shader_thread_shuffle : require
|
|
#else
|
|
#error No extensions available to emulate requested subgroup feature.
|
|
#endif
|
|
|
|
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
|
#elif defined(GL_NV_shader_thread_shuffle)
|
|
#extension GL_NV_shader_thread_shuffle : require
|
|
#else
|
|
#error No extensions available to emulate requested subgroup feature.
|
|
#endif
|
|
|
|
#if defined(GL_KHR_shader_subgroup_arithmetic)
|
|
#extension GL_KHR_shader_subgroup_arithmetic : require
|
|
#elif defined(GL_NV_shader_thread_shuffle)
|
|
#extension GL_NV_shader_thread_shuffle : require
|
|
#else
|
|
#error No extensions available to emulate requested subgroup feature.
|
|
#endif
|
|
// Workgroup size: 128 invocations along X, 1 along Y and Z. main() uses
// gl_LocalInvocationID.x to index the 128-entry input arrays.
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
// Output storage buffer (binding 1, std430 layout).
// Holds one result slot per operand type; main() stores the result of each
// subgroup add / exclusive scan / inclusive scan into the slot of that type.
layout(binding = 1, std430) buffer DATA_OUT
{
    // Signed results.
    int   data_out_int;
    ivec2 data_out_ivec2;
    ivec3 data_out_ivec3;
    ivec4 data_out_ivec4;

    // Unsigned results.
    uint  data_out_uint;
    uvec2 data_out_uvec2;
    uvec3 data_out_uvec3;
    uvec4 data_out_uvec4;
} _16;
|
|
|
|
// Input storage buffer (binding 0, std430 layout).
// One 128-element array per operand type; main() reads element
// gl_LocalInvocationID.x from each array.
layout(binding = 0, std430) buffer DATA_IN
{
    // Signed operands.
    int   data_in_int[128];
    ivec2 data_in_ivec2[128];
    ivec3 data_in_ivec3[128];
    ivec4 data_in_ivec4[128];

    // Unsigned operands.
    uint  data_in_uint[128];
    uvec2 data_in_uvec2[128];
    uvec3 data_in_uvec3[128];
    uvec4 data_in_uvec4[128];
} _29;
|
|
|
|
// Emulation of the gl_Subgroup{Eq,Ge,Gt,Le,Lt}Mask built-ins when
// GL_KHR_shader_subgroup_ballot is not available. Each macro widens the
// vendor mask to a uvec4, zero-filling the components the vendor mask
// does not provide.
#ifndef GL_KHR_shader_subgroup_ballot
#if defined(GL_NV_shader_thread_group)
// NV masks are placed in .x only.
#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u)
#elif defined(GL_ARB_shader_ballot)
// ARB masks are unpacked with unpackUint2x32 into .xy.
#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u)
#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u)
#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u)
#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u)
#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u)
#endif
#endif
|
|
|
|
// Emulation of gl_SubgroupSize when GL_KHR_shader_subgroup_basic is not
// available: alias it to the first matching vendor built-in.
#ifndef GL_KHR_shader_subgroup_basic
#if defined(GL_NV_shader_thread_group)
#define gl_SubgroupSize gl_WarpSizeNV
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupSize gl_SubGroupSizeARB
#elif defined(GL_AMD_gcn_shader)
// Converted to uint by the macro itself.
#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD)
#endif
#endif
|
|
|
|
// Emulation of subgroupBallot() when GL_KHR_shader_subgroup_ballot is not
// available. Returns the vendor ballot of `v` widened to a uvec4, with the
// components the vendor ballot does not provide set to zero.
#ifndef GL_KHR_shader_subgroup_ballot
#if defined(GL_NV_shader_thread_group)
uvec4 subgroupBallot(bool v)
{
    // The NV ballot result goes into .x.
    uint warp_mask = ballotThreadNV(v);
    return uvec4(warp_mask, 0u, 0u, 0u);
}
#elif defined(GL_ARB_shader_ballot)
uvec4 subgroupBallot(bool v)
{
    // The ARB ballot result is unpacked into two 32-bit words (.xy).
    uvec2 mask_words = unpackUint2x32(ballotARB(v));
    return uvec4(mask_words, 0u, 0u);
}
#endif
#endif
|
|
|
|
// Emulation of subgroupElect() when GL_KHR_shader_subgroup_basic is not
// available. Returns true only for the active invocation with the lowest
// lane index.
//
// The fallback paths use findLSB() and the vendor lane-index built-ins
// directly, because this file defines no emulation of
// subgroupBallotFindLSB() or gl_SubgroupInvocationID for them. The backend
// is chosen in the same order as the subgroupBallot() emulation.
#ifndef GL_KHR_shader_subgroup_basic
bool subgroupElect()
{
    uvec4 activeMask = subgroupBallot(true);
#if defined(GL_KHR_shader_subgroup_ballot)
    uint firstLive = subgroupBallotFindLSB(activeMask);
    uint lane = gl_SubgroupInvocationID;
#elif defined(GL_NV_shader_thread_group)
    // The NV ballot lives entirely in .x; the caller is active, so .x != 0.
    uint firstLive = uint(findLSB(activeMask.x));
    uint lane = gl_ThreadInWarpNV;
#elif defined(GL_ARB_shader_ballot)
    // The ARB ballot spans .xy: fall through to the high word when the low
    // word has no bit set (findLSB returns -1 for zero input).
    int lowWord = findLSB(activeMask.x);
    uint firstLive = uint(lowWord != -1 ? lowWord : (findLSB(activeMask.y) + 32));
    uint lane = gl_SubGroupInvocationARB;
#endif
    return lane == firstLive;
}
#endif
|
|
|
|
// Emulation of subgroupBallotBitCount() when GL_KHR_shader_subgroup_ballot
// is not available. Counts the set bits of the ballot: only value.x on the
// NV path, value.x plus value.y otherwise. value.zw are never inspected.
#ifndef GL_KHR_shader_subgroup_ballot
uint subgroupBallotBitCount(uvec4 value)
{
#ifdef GL_NV_shader_thread_group
    return uint(bitCount(value.x));
#else
    return uint(bitCount(value.x) + bitCount(value.y));
#endif
}
#endif
|
|
|
|
// Emulation of subgroupBallotBitExtract() when
// GL_KHR_shader_subgroup_ballot is not available. Returns true when bit
// `index` of the ballot `value` is set.
#ifndef GL_KHR_shader_subgroup_ballot
bool subgroupBallotBitExtract(uvec4 value, uint index)
{
#ifdef GL_NV_shader_thread_group
    // NV path: the whole ballot is in value.x, so index is the bit position.
    uint word = value.x;
    uint bit = index;
#else
    // Otherwise pick the 32-bit word (index / 32) and the bit within it.
    uint word = value[index >> 5u];
    uint bit = index & 0x1fu;
#endif
    return ((word >> bit) & 1u) == 1u;
}
#endif
|
|
|
|
// Emulation of subgroupAdd() on top of GL_NV_shader_thread_shuffle, used only
// when GL_KHR_shader_subgroup_arithmetic is not available.
//
// The eight overloads (uint/uvec2/uvec3/uvec4/int/ivec2/ivec3/ivec4) differ
// only in their type, so they are stamped out from one template macro.
//
// Algorithm, per overload:
//  - If every invocation of the subgroup is active, reduce with XOR shuffles
//    at offsets 1, 2, 4, ... up to gl_SubgroupSize / 2.
//  - Otherwise, visit every lane index in turn and accumulate only the values
//    whose lane is set in the active-invocation ballot.
// Returns the sum of `v` over the active invocations.
#if !defined(GL_KHR_shader_subgroup_arithmetic) && defined(GL_NV_shader_thread_shuffle)
#define SPVC_EMULATE_SUBGROUP_ADD(T) \
T subgroupAdd(T v) \
{ \
    T reduction = T(0); \
    uvec4 active_threads = subgroupBallot(true); \
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) \
    { \
        uint total = gl_SubgroupSize / 2u; \
        reduction = v; \
        for (uint i = 1u; i <= total; i <<= 1u) \
        { \
            bool valid; \
            T s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid); \
            reduction += valid ? s : T(0); \
        } \
    } \
    else \
    { \
        for (uint i = 0u; i < gl_SubgroupSize; ++i) \
        { \
            bool valid = subgroupBallotBitExtract(active_threads, i); \
            T s = shuffleNV(v, i, gl_SubgroupSize); \
            reduction += valid ? s : T(0); \
        } \
    } \
    return reduction; \
}

SPVC_EMULATE_SUBGROUP_ADD(uint)
SPVC_EMULATE_SUBGROUP_ADD(uvec2)
SPVC_EMULATE_SUBGROUP_ADD(uvec3)
SPVC_EMULATE_SUBGROUP_ADD(uvec4)
SPVC_EMULATE_SUBGROUP_ADD(int)
SPVC_EMULATE_SUBGROUP_ADD(ivec2)
SPVC_EMULATE_SUBGROUP_ADD(ivec3)
SPVC_EMULATE_SUBGROUP_ADD(ivec4)

#undef SPVC_EMULATE_SUBGROUP_ADD
#endif
|
|
|
|
// Emulation of subgroupExclusiveAdd() on top of GL_NV_shader_thread_shuffle,
// used only when GL_KHR_shader_subgroup_arithmetic is not available.
//
// The eight overloads (uint/uvec2/uvec3/uvec4/int/ivec2/ivec3/ivec4) differ
// only in their type, so they are stamped out from one template macro.
//
// Algorithm, per overload:
//  - If every invocation of the subgroup is active, build an inclusive scan
//    with shuffleUpNV at offsets 1, 2, 4, ... up to gl_SubgroupSize / 2, then
//    shift the result up by one lane and force the elected invocation's
//    value to zero.
//  - Otherwise, visit every lane index in turn and accumulate the values of
//    active lanes whose index is below bitCount(gl_SubgroupLtMask).
// Returns the sum of `v` over the active invocations preceding the caller.
#if !defined(GL_KHR_shader_subgroup_arithmetic) && defined(GL_NV_shader_thread_shuffle)
#define SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(T) \
T subgroupExclusiveAdd(T v) \
{ \
    T excl_scan = T(0); \
    uvec4 active_threads = subgroupBallot(true); \
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) \
    { \
        uint total = gl_SubgroupSize / 2u; \
        excl_scan = v; \
        for (uint i = 1u; i <= total; i <<= 1u) \
        { \
            bool valid; \
            T s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid); \
            excl_scan += valid ? s : T(0); \
        } \
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize); \
        if (subgroupElect()) \
        { \
            excl_scan = T(0); \
        } \
    } \
    else \
    { \
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask); \
        for (uint i = 0u; i < gl_SubgroupSize; ++i) \
        { \
            bool valid = subgroupBallotBitExtract(active_threads, i); \
            T s = shuffleNV(v, i, gl_SubgroupSize); \
            valid = valid && (i < total); \
            excl_scan += valid ? s : T(0); \
        } \
    } \
    return excl_scan; \
}

SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(uint)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(uvec2)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(uvec3)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(uvec4)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(int)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(ivec2)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(ivec3)
SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD(ivec4)

#undef SPVC_EMULATE_SUBGROUP_EXCLUSIVE_ADD
#endif
|
|
|
|
// Emulation of subgroupInclusiveAdd() on top of GL_NV_shader_thread_shuffle,
// used only when GL_KHR_shader_subgroup_arithmetic is not available.
//
// The eight overloads (uint/uvec2/uvec3/uvec4/int/ivec2/ivec3/ivec4) differ
// only in their type, so they are stamped out from one template macro.
//
// Algorithm, per overload:
//  - If every invocation of the subgroup is active, scan with shuffleUpNV at
//    offsets 1, 2, 4, ... up to gl_SubgroupSize / 2.
//  - Otherwise, visit every lane index in turn and accumulate the values of
//    active lanes whose index is below bitCount(gl_SubgroupLeMask).
// Returns the sum of `v` over the active invocations up to and including
// the caller.
#if !defined(GL_KHR_shader_subgroup_arithmetic) && defined(GL_NV_shader_thread_shuffle)
#define SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(T) \
T subgroupInclusiveAdd(T v) \
{ \
    T incl_scan = T(0); \
    uvec4 active_threads = subgroupBallot(true); \
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize) \
    { \
        uint total = gl_SubgroupSize / 2u; \
        incl_scan = v; \
        for (uint i = 1u; i <= total; i <<= 1u) \
        { \
            bool valid; \
            T s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid); \
            incl_scan += valid ? s : T(0); \
        } \
    } \
    else \
    { \
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask); \
        for (uint i = 0u; i < gl_SubgroupSize; ++i) \
        { \
            bool valid = subgroupBallotBitExtract(active_threads, i); \
            T s = shuffleNV(v, i, gl_SubgroupSize); \
            valid = valid && (i < total); \
            incl_scan += valid ? s : T(0); \
        } \
    } \
    return incl_scan; \
}

SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(uint)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(uvec2)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(uvec3)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(uvec4)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(int)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(ivec2)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(ivec3)
SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD(ivec4)

#undef SPVC_EMULATE_SUBGROUP_INCLUSIVE_ADD
#endif
|
|
|
|
// Entry point. Each invocation reads its element (indexed by
// gl_LocalInvocationID.x) of every input array and stores, in this order,
// the subgroup sum, the exclusive scan and the inclusive scan of each
// operand type into the matching DATA_OUT slot. Later stores overwrite
// earlier ones for the same slot.
void main()
{
    uint idx = gl_LocalInvocationID.x;

    // Per-invocation operands, one per tested type.
    int   in_i  = _29.data_in_int[idx];
    ivec2 in_i2 = _29.data_in_ivec2[idx];
    ivec3 in_i3 = _29.data_in_ivec3[idx];
    ivec4 in_i4 = _29.data_in_ivec4[idx];
    uint  in_u  = _29.data_in_uint[idx];
    uvec2 in_u2 = _29.data_in_uvec2[idx];
    uvec3 in_u3 = _29.data_in_uvec3[idx];
    uvec4 in_u4 = _29.data_in_uvec4[idx];

    // Reduction.
    _16.data_out_int   = subgroupAdd(in_i);
    _16.data_out_ivec2 = subgroupAdd(in_i2);
    _16.data_out_ivec3 = subgroupAdd(in_i3);
    _16.data_out_ivec4 = subgroupAdd(in_i4);
    _16.data_out_uint  = subgroupAdd(in_u);
    _16.data_out_uvec2 = subgroupAdd(in_u2);
    _16.data_out_uvec3 = subgroupAdd(in_u3);
    _16.data_out_uvec4 = subgroupAdd(in_u4);

    // Exclusive scan.
    _16.data_out_int   = subgroupExclusiveAdd(in_i);
    _16.data_out_ivec2 = subgroupExclusiveAdd(in_i2);
    _16.data_out_ivec3 = subgroupExclusiveAdd(in_i3);
    _16.data_out_ivec4 = subgroupExclusiveAdd(in_i4);
    _16.data_out_uint  = subgroupExclusiveAdd(in_u);
    _16.data_out_uvec2 = subgroupExclusiveAdd(in_u2);
    _16.data_out_uvec3 = subgroupExclusiveAdd(in_u3);
    _16.data_out_uvec4 = subgroupExclusiveAdd(in_u4);

    // Inclusive scan.
    _16.data_out_int   = subgroupInclusiveAdd(in_i);
    _16.data_out_ivec2 = subgroupInclusiveAdd(in_i2);
    _16.data_out_ivec3 = subgroupInclusiveAdd(in_i3);
    _16.data_out_ivec4 = subgroupInclusiveAdd(in_i4);
    _16.data_out_uint  = subgroupInclusiveAdd(in_u);
    _16.data_out_uvec2 = subgroupInclusiveAdd(in_u2);
    _16.data_out_uvec3 = subgroupInclusiveAdd(in_u3);
    _16.data_out_uvec4 = subgroupInclusiveAdd(in_u4);
}
|
|
|