MSL: Add support for subgroup operations.
Some support for subgroups is present starting in Metal 2.0 on both iOS
and macOS. macOS gains more complete support in 10.14 (Metal 2.1).
Some restrictions are present. On iOS and on macOS 10.13, the
implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has
already terminated or is not executing a conditional branch, the first
thread that *is* will falsely believe itself not to be. Unfortunately,
this operation is part of the "basic" feature set; without it, subgroups
cannot be supported at all.
The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only
available in compute shaders (and, by extension, tessellation control
shaders), despite SPIR-V making them available in all stages. This
limits the usefulness of some of the subgroup operations in fragment
shaders.
Although Metal on macOS supports some clustered, inclusive, and
exclusive operations, it does not support them all. In particular,
inclusive and exclusive min, max, and, or, and xor; as well as cluster
sizes other than 4 are not supported. If this becomes a problem, they
could be emulated, but at a significant performance cost due to the need
for non-uniform operations.
2019-05-15 21:03:30 +00:00
|
|
|
#pragma clang diagnostic ignored "-Wmissing-prototypes"
|
|
|
|
|
|
|
|
#include <metal_stdlib>
|
|
|
|
#include <simd/simd.h>
|
|
|
|
|
|
|
|
using namespace metal;
|
|
|
|
|
|
|
|
// Buffer layout bound to the kernel's `_9` parameter at [[buffer(0)]].
// The kernel repeatedly overwrites its single float member.
struct SSBO
|
|
|
|
{
|
|
|
|
// Sole member; written by main0.
float FragColor;
|
|
|
|
};
|
|
|
|
|
2019-09-18 19:56:51 +00:00
|
|
|
// Workgroup-size constant, all three components 1. Tagged [[maybe_unused]];
// it is not referenced by any code visible in this file.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u);
|
|
|
|
|
2020-10-21 06:51:48 +00:00
|
|
|
template<typename T>
|
|
|
|
inline T spvSubgroupBroadcast(T value, ushort lane)
|
|
|
|
{
|
|
|
|
return simd_broadcast(value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupBroadcast(bool value, ushort lane)
|
|
|
|
{
|
|
|
|
return !!simd_broadcast((ushort)value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvSubgroupBroadcast(vec<bool, N> value, ushort lane)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)simd_broadcast((vec<ushort, N>)value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Generic "broadcast first": thin wrapper over Metal's simd_broadcast_first().
template<typename T>
inline T spvSubgroupBroadcastFirst(T value)
{
    const T fromFirst = simd_broadcast_first(value);
    return fromFirst;
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupBroadcastFirst(bool value)
|
|
|
|
{
|
|
|
|
return !!simd_broadcast_first((ushort)value);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvSubgroupBroadcastFirst(vec<bool, N> value)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)simd_broadcast_first((vec<ushort, N>)value);
|
|
|
|
}
|
|
|
|
|
MSL: Add support for subgroup operations.
Some support for subgroups is present starting in Metal 2.0 on both iOS
and macOS. macOS gains more complete support in 10.14 (Metal 2.1).
Some restrictions are present. On iOS and on macOS 10.13, the
implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has
already terminated or is not executing a conditional branch, the first
thread that *is* will falsely believe itself not to be. Unfortunately,
this operation is part of the "basic" feature set; without it, subgroups
cannot be supported at all.
The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only
available in compute shaders (and, by extension, tessellation control
shaders), despite SPIR-V making them available in all stages. This
limits the usefulness of some of the subgroup operations in fragment
shaders.
Although Metal on macOS supports some clustered, inclusive, and
exclusive operations, it does not support them all. In particular,
inclusive and exclusive min, max, and, or, and xor; as well as cluster
sizes other than 4 are not supported. If this becomes a problem, they
could be emulated, but at a significant performance cost due to the need
for non-uniform operations.
2019-05-15 21:03:30 +00:00
|
|
|
// Collects a ballot of `value` across the SIMD-group and repackages it as
// the uint4 that SPIR-V callers expect.
//
// simd_ballot() yields a 64-bit integer-like simd_vote; its bits are
// reinterpreted as two 32-bit words and placed in .xy, with .zw zeroed.
// FIXME: The upper words stay zero, so this won't include higher bits if
// Apple ever supports 128 lanes in an SIMD-group.
inline uint4 spvSubgroupBallot(bool value)
{
    const simd_vote::vote_t packedVotes = (simd_vote::vote_t)simd_ballot(value);
    const uint2 lowWords = as_type<uint2>(packedVotes);
    return uint4(lowWords, 0, 0);
}
|
|
|
|
|
|
|
|
// Tests a single bit of a 128-bit ballot: `bit / 32` picks the component,
// `bit % 32` picks the position inside it.
inline bool spvSubgroupBallotBitExtract(uint4 ballot, uint bit)
{
    const uint wordIndex = bit / 32;
    const uint bitInWord = bit % 32;
    return extract_bits(ballot[wordIndex], bitInWord, 1) != 0;
}
|
|
|
|
|
2020-10-21 04:59:30 +00:00
|
|
|
// Returns the index of the lowest set bit of `ballot`, considering only the
// first gl_SubgroupSize bits, or uint(-1) when none of those bits is set.
inline uint spvSubgroupBallotFindLSB(uint4 ballot, uint gl_SubgroupSize)
{
    // Build a mask covering lanes [0, gl_SubgroupSize): the first word takes
    // up to 32 lanes, the second word whatever remains; .zw are always zero.
    const uint lowWord = extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u));
    const uint highWord = extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0));
    const uint4 live = ballot & uint4(lowWord, highWord, uint2(0));

    // Scan from the least significant word upwards; the first non-zero word
    // holds the answer.
    if (live.x != 0)
        return ctz(live.x);
    if (live.y != 0)
        return 32 + ctz(live.y);
    if (live.z != 0)
        return 64 + ctz(live.z);
    if (live.w != 0)
        return 96 + ctz(live.w);
    return uint(-1);
}
|
|
|
|
|
2020-10-21 04:59:30 +00:00
|
|
|
// Returns the index of the highest set bit of `ballot`, considering only the
// first gl_SubgroupSize bits, or uint(-1) when none of those bits is set.
inline uint spvSubgroupBallotFindMSB(uint4 ballot, uint gl_SubgroupSize)
{
    // Mask covering lanes [0, gl_SubgroupSize); .zw are always zero.
    const uint lowWord = extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u));
    const uint highWord = extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0));
    const uint4 live = ballot & uint4(lowWord, highWord, uint2(0));

    // Scan from the most significant word downwards; within a word the top
    // set bit sits at (word base + 32) - (leading zeros + 1).
    if (live.w != 0)
        return 128 - (clz(live.w) + 1);
    if (live.z != 0)
        return 96 - (clz(live.z) + 1);
    if (live.y != 0)
        return 64 - (clz(live.y) + 1);
    if (live.x != 0)
        return 32 - (clz(live.x) + 1);
    return uint(-1);
}
|
|
|
|
|
2020-10-21 04:59:30 +00:00
|
|
|
// Total number of set bits across all four components of `ballot`.
inline uint spvPopCount4(uint4 ballot)
{
    uint total = popcount(ballot.x);
    total += popcount(ballot.y);
    total += popcount(ballot.z);
    total += popcount(ballot.w);
    return total;
}
|
|
|
|
|
2020-10-21 04:59:30 +00:00
|
|
|
// Counts the set bits of `ballot` that fall within the first
// gl_SubgroupSize lanes; bits beyond that are masked off before counting.
inline uint spvSubgroupBallotBitCount(uint4 ballot, uint gl_SubgroupSize)
{
    const uint lowWord = extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u));
    const uint highWord = extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0));
    const uint4 laneMask = uint4(lowWord, highWord, uint2(0));
    return spvPopCount4(ballot & laneMask);
}
|
|
|
|
|
MSL: Add support for subgroup operations.
Some support for subgroups is present starting in Metal 2.0 on both iOS
and macOS. macOS gains more complete support in 10.14 (Metal 2.1).
Some restrictions are present. On iOS and on macOS 10.13, the
implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has
already terminated or is not executing a conditional branch, the first
thread that *is* will falsely believe itself not to be. Unfortunately,
this operation is part of the "basic" feature set; without it, subgroups
cannot be supported at all.
The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only
available in compute shaders (and, by extension, tessellation control
shaders), despite SPIR-V making them available in all stages. This
limits the usefulness of some of the subgroup operations in fragment
shaders.
Although Metal on macOS supports some clustered, inclusive, and
exclusive operations, it does not support them all. In particular,
inclusive and exclusive min, max, and, or, and xor; as well as cluster
sizes other than 4 are not supported. If this becomes a problem, they
could be emulated, but at a significant performance cost due to the need
for non-uniform operations.
2019-05-15 21:03:30 +00:00
|
|
|
// Counts the set bits of `ballot` in lanes 0 through gl_SubgroupInvocationID,
// the calling lane included (hence the "+ 1" in both mask words).
inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)
{
    const uint lowWord = extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u));
    const uint highWord = extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0));
    const uint4 laneMask = uint4(lowWord, highWord, uint2(0));
    return spvPopCount4(ballot & laneMask);
}
|
|
|
|
|
|
|
|
// Counts the set bits of `ballot` in the lanes strictly below
// gl_SubgroupInvocationID (the calling lane itself is excluded).
inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)
{
    const uint lowWord = extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u));
    const uint highWord = extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0));
    const uint4 laneMask = uint4(lowWord, highWord, uint2(0));
    return spvPopCount4(ballot & laneMask);
}
|
|
|
|
|
|
|
|
// Generic all-equal test: each lane compares its `value` with the one
// returned by simd_broadcast_first(), reduces the comparison with all(),
// and the per-lane verdicts are combined with simd_all().
template<typename T>
inline bool spvSubgroupAllEqual(T value)
{
    const T reference = simd_broadcast_first(value);
    return simd_all(all(value == reference));
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupAllEqual(bool value)
|
|
|
|
{
|
|
|
|
return simd_all(value) || !simd_any(value);
|
|
|
|
}
|
|
|
|
|
2020-10-21 04:04:43 +00:00
|
|
|
template<uint N>
|
|
|
|
inline bool spvSubgroupAllEqual(vec<bool, N> value)
|
|
|
|
{
|
|
|
|
return simd_all(all(value == (vec<bool, N>)simd_broadcast_first((vec<ushort, N>)value)));
|
|
|
|
}
|
|
|
|
|
2020-10-21 06:51:48 +00:00
|
|
|
template<typename T>
|
|
|
|
inline T spvSubgroupShuffle(T value, ushort lane)
|
|
|
|
{
|
|
|
|
return simd_shuffle(value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupShuffle(bool value, ushort lane)
|
|
|
|
{
|
|
|
|
return !!simd_shuffle((ushort)value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvSubgroupShuffle(vec<bool, N> value, ushort lane)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)simd_shuffle((vec<ushort, N>)value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
inline T spvSubgroupShuffleXor(T value, ushort mask)
|
|
|
|
{
|
|
|
|
return simd_shuffle_xor(value, mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupShuffleXor(bool value, ushort mask)
|
|
|
|
{
|
|
|
|
return !!simd_shuffle_xor((ushort)value, mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvSubgroupShuffleXor(vec<bool, N> value, ushort mask)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)simd_shuffle_xor((vec<ushort, N>)value, mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
inline T spvSubgroupShuffleUp(T value, ushort delta)
|
|
|
|
{
|
|
|
|
return simd_shuffle_up(value, delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupShuffleUp(bool value, ushort delta)
|
|
|
|
{
|
|
|
|
return !!simd_shuffle_up((ushort)value, delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvSubgroupShuffleUp(vec<bool, N> value, ushort delta)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)simd_shuffle_up((vec<ushort, N>)value, delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
inline T spvSubgroupShuffleDown(T value, ushort delta)
|
|
|
|
{
|
|
|
|
return simd_shuffle_down(value, delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvSubgroupShuffleDown(bool value, ushort delta)
|
|
|
|
{
|
|
|
|
return !!simd_shuffle_down((ushort)value, delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvSubgroupShuffleDown(vec<bool, N> value, ushort delta)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)simd_shuffle_down((vec<ushort, N>)value, delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
inline T spvQuadBroadcast(T value, uint lane)
|
|
|
|
{
|
|
|
|
return quad_broadcast(value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvQuadBroadcast(bool value, uint lane)
|
|
|
|
{
|
|
|
|
return !!quad_broadcast((ushort)value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvQuadBroadcast(vec<bool, N> value, uint lane)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)quad_broadcast((vec<ushort, N>)value, lane);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
inline T spvQuadSwap(T value, uint dir)
|
|
|
|
{
|
|
|
|
return quad_shuffle_xor(value, dir + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
inline bool spvQuadSwap(bool value, uint dir)
|
|
|
|
{
|
|
|
|
return !!quad_shuffle_xor((ushort)value, dir + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<uint N>
|
|
|
|
inline vec<bool, N> spvQuadSwap(vec<bool, N> value, uint dir)
|
|
|
|
{
|
|
|
|
return (vec<bool, N>)quad_shuffle_xor((vec<ushort, N>)value, dir + 1);
|
|
|
|
}
|
|
|
|
|
MSL: Add support for subgroup operations.
Some support for subgroups is present starting in Metal 2.0 on both iOS
and macOS. macOS gains more complete support in 10.14 (Metal 2.1).
Some restrictions are present. On iOS and on macOS 10.13, the
implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has
already terminated or is not executing a conditional branch, the first
thread that *is* will falsely believe itself not to be. Unfortunately,
this operation is part of the "basic" feature set; without it, subgroups
cannot be supported at all.
The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only
available in compute shaders (and, by extension, tessellation control
shaders), despite SPIR-V making them available in all stages. This
limits the usefulness of some of the subgroup operations in fragment
shaders.
Although Metal on macOS supports some clustered, inclusive, and
exclusive operations, it does not support them all. In particular,
inclusive and exclusive min, max, and, or, and xor; as well as cluster
sizes other than 4 are not supported. If this becomes a problem, they
could be emulated, but at a significant performance cost due to the need
for non-uniform operations.
2019-05-15 21:03:30 +00:00
|
|
|
// Compute kernel exercising the subgroup helpers and the SIMD-group / quad
// intrinsics. It derives the five subgroup lane masks from the invocation
// id and subgroup size, writes several builtins and mask words to
// `_9.FragColor` (each write overwrites the previous one), and then calls
// each subgroup operation once. Most results are only kept in locals.
kernel void main0(device SSBO& _9 [[buffer(0)]],
                  uint gl_NumSubgroups [[simdgroups_per_threadgroup]],
                  uint gl_SubgroupID [[simdgroup_index_in_threadgroup]],
                  uint gl_SubgroupSize [[thread_execution_width]],
                  uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]])
{
    // --- Lane masks -------------------------------------------------------
    // Eq: only this lane's bit, placed in .x for lanes 0..31 and .y otherwise.
    uint4 gl_SubgroupEqMask = gl_SubgroupInvocationID >= 32
        ? uint4(0, (1 << (gl_SubgroupInvocationID - 32)), uint2(0))
        : uint4(1 << gl_SubgroupInvocationID, uint3(0));

    // Ge: bits from this lane up to (but not including) gl_SubgroupSize.
    uint4 gl_SubgroupGeMask = uint4(
        insert_bits(0u, 0xFFFFFFFF, min(gl_SubgroupInvocationID, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID, 0)),
        insert_bits(0u, 0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID, 32u), 0)),
        uint2(0));

    // Gt: same as Ge but starting one lane later.
    uint4 gl_SubgroupGtMask = uint4(
        insert_bits(0u, 0xFFFFFFFF, min(gl_SubgroupInvocationID + 1, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID - 1, 0)),
        insert_bits(0u, 0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID + 1, 32u), 0)),
        uint2(0));

    // Le: bits 0 through this lane, inclusive.
    uint4 gl_SubgroupLeMask = uint4(
        extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)),
        extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)),
        uint2(0));

    // Lt: bits strictly below this lane.
    uint4 gl_SubgroupLtMask = uint4(
        extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)),
        extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)),
        uint2(0));

    // --- Builtins ---------------------------------------------------------
    _9.FragColor = float(gl_NumSubgroups);
    _9.FragColor = float(gl_SubgroupID);
    _9.FragColor = float(gl_SubgroupSize);
    _9.FragColor = float(gl_SubgroupInvocationID);

    // --- Barriers ---------------------------------------------------------
    simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);
    simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);
    simdgroup_barrier(mem_flags::mem_device);
    simdgroup_barrier(mem_flags::mem_threadgroup);
    simdgroup_barrier(mem_flags::mem_texture);

    // --- Elect ------------------------------------------------------------
    bool elected = simd_is_first();

    // --- Mask outputs (only the .x word of each mask is stored) ------------
    _9.FragColor = float4(gl_SubgroupEqMask).x;
    _9.FragColor = float4(gl_SubgroupGeMask).x;
    _9.FragColor = float4(gl_SubgroupGtMask).x;
    _9.FragColor = float4(gl_SubgroupLeMask).x;
    _9.FragColor = float4(gl_SubgroupLtMask).x;

    // --- Broadcast --------------------------------------------------------
    float4 broadcasted = spvSubgroupBroadcast(float4(10.0), 8u);
    bool2 broadcasted_bool = spvSubgroupBroadcast(bool2(true), 8u);
    float3 first = spvSubgroupBroadcastFirst(float3(20.0));
    bool4 first_bool = spvSubgroupBroadcastFirst(bool4(false));

    // --- Ballot -----------------------------------------------------------
    uint4 ballot_value = spvSubgroupBallot(true);
    bool inverse_ballot_value = spvSubgroupBallotBitExtract(ballot_value, gl_SubgroupInvocationID);
    bool bit_extracted = spvSubgroupBallotBitExtract(uint4(10u), 8u);
    uint bit_count = spvSubgroupBallotBitCount(ballot_value, gl_SubgroupSize);
    uint inclusive_bit_count = spvSubgroupBallotInclusiveBitCount(ballot_value, gl_SubgroupInvocationID);
    uint exclusive_bit_count = spvSubgroupBallotExclusiveBitCount(ballot_value, gl_SubgroupInvocationID);
    uint lsb = spvSubgroupBallotFindLSB(ballot_value, gl_SubgroupSize);
    uint msb = spvSubgroupBallotFindMSB(ballot_value, gl_SubgroupSize);

    // --- Shuffle ----------------------------------------------------------
    uint shuffled = spvSubgroupShuffle(10u, 8u);
    bool shuffled_bool = spvSubgroupShuffle(true, 9u);
    uint shuffled_xor = spvSubgroupShuffleXor(30u, 8u);
    bool shuffled_xor_bool = spvSubgroupShuffleXor(false, 9u);
    uint shuffled_up = spvSubgroupShuffleUp(20u, 4u);
    bool shuffled_up_bool = spvSubgroupShuffleUp(true, 4u);
    uint shuffled_down = spvSubgroupShuffleDown(20u, 4u);
    bool shuffled_down_bool = spvSubgroupShuffleDown(false, 4u);

    // --- Vote -------------------------------------------------------------
    bool has_all = simd_all(true);
    bool has_any = simd_any(true);
    bool has_equal = spvSubgroupAllEqual(0);
    has_equal = spvSubgroupAllEqual(true);
    has_equal = spvSubgroupAllEqual(float3(0.0, 1.0, 2.0));
    has_equal = spvSubgroupAllEqual(bool4(true, true, false, true));

    // --- Arithmetic reductions ---------------------------------------------
    float4 added = simd_sum(float4(20.0));
    int4 iadded = simd_sum(int4(20));
    float4 multiplied = simd_product(float4(20.0));
    int4 imultiplied = simd_product(int4(20));
    float4 lo = simd_min(float4(20.0));
    float4 hi = simd_max(float4(20.0));
    int4 slo = simd_min(int4(20));
    int4 shi = simd_max(int4(20));
    uint4 ulo = simd_min(uint4(20u));
    uint4 uhi = simd_max(uint4(20u));
    uint4 anded = simd_and(ballot_value);
    uint4 ored = simd_or(ballot_value);
    uint4 xored = simd_xor(ballot_value);
    bool4 anded_b = simd_and(ballot_value == uint4(42u));
    bool4 ored_b = simd_or(ballot_value == uint4(42u));
    bool4 xored_b = simd_xor(ballot_value == uint4(42u));

    // --- Inclusive / exclusive scans ----------------------------------------
    added = simd_prefix_inclusive_sum(added);
    iadded = simd_prefix_inclusive_sum(iadded);
    multiplied = simd_prefix_inclusive_product(multiplied);
    imultiplied = simd_prefix_inclusive_product(imultiplied);
    added = simd_prefix_exclusive_sum(multiplied);
    multiplied = simd_prefix_exclusive_product(multiplied);
    iadded = simd_prefix_exclusive_sum(imultiplied);
    imultiplied = simd_prefix_exclusive_product(imultiplied);

    // --- Quad reductions ----------------------------------------------------
    added = quad_sum(added);
    multiplied = quad_product(multiplied);
    iadded = quad_sum(iadded);
    imultiplied = quad_product(imultiplied);
    lo = quad_min(lo);
    hi = quad_max(hi);
    ulo = quad_min(ulo);
    uhi = quad_max(uhi);
    slo = quad_min(slo);
    shi = quad_max(shi);
    anded = quad_and(anded);
    ored = quad_or(ored);
    xored = quad_xor(xored);
    anded_b = quad_and(anded == uint4(2u));
    ored_b = quad_or(ored == uint4(3u));
    xored_b = quad_xor(xored == uint4(4u));

    // --- Quad swap / broadcast ----------------------------------------------
    float4 swap_horiz = spvQuadSwap(float4(20.0), 0u);
    bool4 swap_horiz_bool = spvQuadSwap(bool4(true), 0u);
    float4 swap_vertical = spvQuadSwap(float4(20.0), 1u);
    bool4 swap_vertical_bool = spvQuadSwap(bool4(true), 1u);
    float4 swap_diagonal = spvQuadSwap(float4(20.0), 2u);
    bool4 swap_diagonal_bool = spvQuadSwap(bool4(true), 2u);
    float4 quad_broadcast0 = spvQuadBroadcast(float4(20.0), 3u);
    bool4 quad_broadcast_bool = spvQuadBroadcast(bool4(true), 3u);
}
|
|
|
|
|