SPIRV-Cross/reference/opt/shaders-msl/vulkan/comp/subgroups.nocompat.invalid.vk.msl21.comp

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#pragma clang diagnostic ignored "-Wunused-variable"

#include <metal_stdlib>
#include <simd/simd.h>
	
template <typename T, size_t Num>
struct unsafe_array
{
	T __Elements[Num ? Num : 1];
	
	constexpr size_t size() const thread { return Num; }
	constexpr size_t max_size() const thread { return Num; }
	constexpr bool empty() const thread { return Num == 0; }
	
	constexpr size_t size() const device { return Num; }
	constexpr size_t max_size() const device { return Num; }
	constexpr bool empty() const device { return Num == 0; }
	
	constexpr size_t size() const constant { return Num; }
	constexpr size_t max_size() const constant { return Num; }
	constexpr bool empty() const constant { return Num == 0; }
	
	constexpr size_t size() const threadgroup { return Num; }
	constexpr size_t max_size() const threadgroup { return Num; }
	constexpr bool empty() const threadgroup { return Num == 0; }
	
	thread T &operator[](size_t pos) thread
	{
		return __Elements[pos];
	}
	constexpr const thread T &operator[](size_t pos) const thread
	{
		return __Elements[pos];
	}
	
	device T &operator[](size_t pos) device
	{
		return __Elements[pos];
	}
	constexpr const device T &operator[](size_t pos) const device
	{
		return __Elements[pos];
	}
	
	constexpr const constant T &operator[](size_t pos) const constant
	{
		return __Elements[pos];
	}
	
	threadgroup T &operator[](size_t pos) threadgroup
	{
		return __Elements[pos];
	}
	constexpr const threadgroup T &operator[](size_t pos) const threadgroup
	{
		return __Elements[pos];
	}
};

using namespace metal;

struct SSBO
{
    float FragColor;
};

inline uint4 spvSubgroupBallot(bool value)
{
    simd_vote vote = simd_ballot(value);
    // simd_ballot() returns a 64-bit integer-like object, but
    // SPIR-V callers expect a uint4. We must convert.
    // FIXME: This won't include higher bits if Apple ever supports
    // 128 lanes in an SIMD-group.
    return uint4((uint)((simd_vote::vote_t)vote & 0xFFFFFFFF), (uint)(((simd_vote::vote_t)vote >> 32) & 0xFFFFFFFF), 0, 0);
}

inline bool spvSubgroupBallotBitExtract(uint4 ballot, uint bit)
{
    return !!extract_bits(ballot[bit / 32], bit % 32, 1);
}

inline uint spvSubgroupBallotFindLSB(uint4 ballot)
{
    return select(ctz(ballot.x), select(32 + ctz(ballot.y), select(64 + ctz(ballot.z), select(96 + ctz(ballot.w), uint(-1), ballot.w == 0), ballot.z == 0), ballot.y == 0), ballot.x == 0);
}

inline uint spvSubgroupBallotFindMSB(uint4 ballot)
{
    return select(128 - (clz(ballot.w) + 1), select(96 - (clz(ballot.z) + 1), select(64 - (clz(ballot.y) + 1), select(32 - (clz(ballot.x) + 1), uint(-1), ballot.x == 0), ballot.y == 0), ballot.z == 0), ballot.w == 0);
}

inline uint spvSubgroupBallotBitCount(uint4 ballot)
{
    return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w);
}

inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)
{
    uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), uint2(0));
    return spvSubgroupBallotBitCount(ballot & mask);
}

inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)
{
    uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));
    return spvSubgroupBallotBitCount(ballot & mask);
}

template<typename T>
inline bool spvSubgroupAllEqual(T value)
{
    return simd_all(value == simd_broadcast_first(value));
}

template<>
inline bool spvSubgroupAllEqual(bool value)
{
    return simd_all(value) || !simd_any(value);
}

kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[simdgroups_per_threadgroup]], uint gl_SubgroupID [[simdgroup_index_in_threadgroup]], uint gl_SubgroupSize [[thread_execution_width]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]])
{
    uint4 gl_SubgroupEqMask = gl_SubgroupInvocationID > 32 ? uint4(0, (1 << (gl_SubgroupInvocationID - 32)), uint2(0)) : uint4(1 << gl_SubgroupInvocationID, uint3(0));
    uint4 gl_SubgroupGeMask = uint4(extract_bits(0xFFFFFFFF, min(gl_SubgroupInvocationID, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID, 0)), extract_bits(0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID, 32u), 0)), uint2(0));
    uint4 gl_SubgroupGtMask = uint4(extract_bits(0xFFFFFFFF, min(gl_SubgroupInvocationID + 1, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID - 1, 0)), extract_bits(0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID + 1, 32u), 0)), uint2(0));
    uint4 gl_SubgroupLeMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), uint2(0));
    uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));
    _9.FragColor = float(gl_NumSubgroups);
    _9.FragColor = float(gl_SubgroupID);
    _9.FragColor = float(gl_SubgroupSize);
    _9.FragColor = float(gl_SubgroupInvocationID);
    simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);
    simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);
    simdgroup_barrier(mem_flags::mem_device);
    simdgroup_barrier(mem_flags::mem_threadgroup);
    simdgroup_barrier(mem_flags::mem_texture);
    _9.FragColor = float4(gl_SubgroupEqMask).x;
    _9.FragColor = float4(gl_SubgroupGeMask).x;
    _9.FragColor = float4(gl_SubgroupGtMask).x;
    _9.FragColor = float4(gl_SubgroupLeMask).x;
    _9.FragColor = float4(gl_SubgroupLtMask).x;
    uint4 _83 = spvSubgroupBallot(true);
    float4 _165 = simd_prefix_inclusive_product(simd_product(float4(20.0)));
    int4 _167 = simd_prefix_inclusive_product(simd_product(int4(20)));
}
MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00			`#pragma clang diagnostic ignored "-Wmissing-prototypes"`
Update the Metal shaders to account for changes in the shader compilation. 2019-08-14 15:09:39 +00:00			`#pragma clang diagnostic ignored "-Wmissing-braces"`
			`#pragma clang diagnostic ignored "-Wunused-variable"`
MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00
			`#include <metal_stdlib>`
			`#include <simd/simd.h>`
Update the Metal shaders to account for changes in the shader compilation. 2019-08-14 15:09:39 +00:00
			`template <typename T, size_t Num>`
			`struct unsafe_array`
			`{`
			`T __Elements[Num ? Num : 1];`

			`constexpr size_t size() const thread { return Num; }`
			`constexpr size_t max_size() const thread { return Num; }`
			`constexpr bool empty() const thread { return Num == 0; }`

			`constexpr size_t size() const device { return Num; }`
			`constexpr size_t max_size() const device { return Num; }`
			`constexpr bool empty() const device { return Num == 0; }`

			`constexpr size_t size() const constant { return Num; }`
			`constexpr size_t max_size() const constant { return Num; }`
			`constexpr bool empty() const constant { return Num == 0; }`

			`constexpr size_t size() const threadgroup { return Num; }`
			`constexpr size_t max_size() const threadgroup { return Num; }`
			`constexpr bool empty() const threadgroup { return Num == 0; }`

			`thread T &operator[](size_t pos) thread`
			`{`
			`return __Elements[pos];`
			`}`
			`constexpr const thread T &operator[](size_t pos) const thread`
			`{`
			`return __Elements[pos];`
			`}`

			`device T &operator[](size_t pos) device`
			`{`
			`return __Elements[pos];`
			`}`
			`constexpr const device T &operator[](size_t pos) const device`
			`{`
			`return __Elements[pos];`
			`}`

			`constexpr const constant T &operator[](size_t pos) const constant`
			`{`
			`return __Elements[pos];`
			`}`

			`threadgroup T &operator[](size_t pos) threadgroup`
			`{`
			`return __Elements[pos];`
			`}`
			`constexpr const threadgroup T &operator[](size_t pos) const threadgroup`
			`{`
			`return __Elements[pos];`
			`}`
			`};`
MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00
			`using namespace metal;`

			`struct SSBO`
			`{`
			`float FragColor;`
			`};`

			`inline uint4 spvSubgroupBallot(bool value)`
			`{`
			`simd_vote vote = simd_ballot(value);`
			`// simd_ballot() returns a 64-bit integer-like object, but`
			`// SPIR-V callers expect a uint4. We must convert.`
			`// FIXME: This won't include higher bits if Apple ever supports`
			`// 128 lanes in an SIMD-group.`
			`return uint4((uint)((simd_vote::vote_t)vote & 0xFFFFFFFF), (uint)(((simd_vote::vote_t)vote >> 32) & 0xFFFFFFFF), 0, 0);`
			`}`

			`inline bool spvSubgroupBallotBitExtract(uint4 ballot, uint bit)`
			`{`
			`return !!extract_bits(ballot[bit / 32], bit % 32, 1);`
			`}`

			`inline uint spvSubgroupBallotFindLSB(uint4 ballot)`
			`{`
			`return select(ctz(ballot.x), select(32 + ctz(ballot.y), select(64 + ctz(ballot.z), select(96 + ctz(ballot.w), uint(-1), ballot.w == 0), ballot.z == 0), ballot.y == 0), ballot.x == 0);`
			`}`

			`inline uint spvSubgroupBallotFindMSB(uint4 ballot)`
			`{`
			`return select(128 - (clz(ballot.w) + 1), select(96 - (clz(ballot.z) + 1), select(64 - (clz(ballot.y) + 1), select(32 - (clz(ballot.x) + 1), uint(-1), ballot.x == 0), ballot.y == 0), ballot.z == 0), ballot.w == 0);`
			`}`

			`inline uint spvSubgroupBallotBitCount(uint4 ballot)`
			`{`
			`return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w);`
			`}`

			`inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)`
			`{`
			`uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), uint2(0));`
			`return spvSubgroupBallotBitCount(ballot & mask);`
			`}`

			`inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)`
			`{`
			`uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));`
			`return spvSubgroupBallotBitCount(ballot & mask);`
			`}`

			`template<typename T>`
			`inline bool spvSubgroupAllEqual(T value)`
			`{`
			`return simd_all(value == simd_broadcast_first(value));`
			`}`

			`template<>`
			`inline bool spvSubgroupAllEqual(bool value)`
			`{`
			`return simd_all(value) \|\| !simd_any(value);`
			`}`

			`kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[simdgroups_per_threadgroup]], uint gl_SubgroupID [[simdgroup_index_in_threadgroup]], uint gl_SubgroupSize [[thread_execution_width]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]])`
			`{`
MSL: Support SubgroupLocalInvocationId and SubgroupSize in all stages. MSL prior to 2.2 doesn't support these natively in any stage but compute. But, we can (assuming no threads were terminated prematurely) get their values with some creative uses of the `simd_prefix_exclusive_sum()` and `simd_sum()` functions. Also, fix a missing `to_expression()` with `BuiltInSubgroupEqMask`. For KhronosGroup/MoltenVK#629. 2019-05-31 18:42:36 +00:00			`uint4 gl_SubgroupEqMask = gl_SubgroupInvocationID > 32 ? uint4(0, (1 << (gl_SubgroupInvocationID - 32)), uint2(0)) : uint4(1 << gl_SubgroupInvocationID, uint3(0));`
MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00			`uint4 gl_SubgroupGeMask = uint4(extract_bits(0xFFFFFFFF, min(gl_SubgroupInvocationID, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID, 0)), extract_bits(0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID, 32u), 0)), uint2(0));`
			`uint4 gl_SubgroupGtMask = uint4(extract_bits(0xFFFFFFFF, min(gl_SubgroupInvocationID + 1, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID - 1, 0)), extract_bits(0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID + 1, 32u), 0)), uint2(0));`
			`uint4 gl_SubgroupLeMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), uint2(0));`
			`uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));`
			`_9.FragColor = float(gl_NumSubgroups);`
			`_9.FragColor = float(gl_SubgroupID);`
			`_9.FragColor = float(gl_SubgroupSize);`
			`_9.FragColor = float(gl_SubgroupInvocationID);`
			`simdgroup_barrier(mem_flags::mem_device \| mem_flags::mem_threadgroup \| mem_flags::mem_texture);`
			`simdgroup_barrier(mem_flags::mem_device \| mem_flags::mem_threadgroup \| mem_flags::mem_texture);`
			`simdgroup_barrier(mem_flags::mem_device);`
			`simdgroup_barrier(mem_flags::mem_threadgroup);`
			`simdgroup_barrier(mem_flags::mem_texture);`
			`_9.FragColor = float4(gl_SubgroupEqMask).x;`
			`_9.FragColor = float4(gl_SubgroupGeMask).x;`
			`_9.FragColor = float4(gl_SubgroupGtMask).x;`
			`_9.FragColor = float4(gl_SubgroupLeMask).x;`
			`_9.FragColor = float4(gl_SubgroupLtMask).x;`
			`uint4 _83 = spvSubgroupBallot(true);`
			`float4 _165 = simd_prefix_inclusive_product(simd_product(float4(20.0)));`
			`int4 _167 = simd_prefix_inclusive_product(simd_product(int4(20)));`
			`}`