SPIRV-Cross/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.ios.comp

#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

// Generic shuffle helper: hands `value` and the source `lane` straight to
// Metal's quad_shuffle() and returns whatever it yields. Used for every type
// that quad_shuffle() accepts directly.
template<typename T>
inline T spvSubgroupShuffle(T value, ushort lane)
{
    const T fetched = quad_shuffle(value, lane);
    return fetched;
}

// bool specialization: the value is widened to ushort before quad_shuffle()
// and narrowed back afterwards (any non-zero result maps to true).
template<>
inline bool spvSubgroupShuffle(bool value, ushort lane)
{
    const ushort widened = ushort(value);
    return quad_shuffle(widened, lane) != 0;
}

// bool-vector overload: converts to a ushort vector of the same width,
// shuffles that, then converts the result back to a bool vector.
template<uint N>
inline vec<bool, N> spvSubgroupShuffle(vec<bool, N> value, ushort lane)
{
    using Wide = vec<ushort, N>;
    using Narrow = vec<bool, N>;
    const Wide widened = Wide(value);
    return Narrow(quad_shuffle(widened, lane));
}

// Generic XOR shuffle helper: forwards `value` and `mask` unchanged to
// Metal's quad_shuffle_xor() and returns its result.
template<typename T>
inline T spvSubgroupShuffleXor(T value, ushort mask)
{
    const T fetched = quad_shuffle_xor(value, mask);
    return fetched;
}

// bool specialization: widen to ushort for quad_shuffle_xor(), then map a
// non-zero result back to true.
template<>
inline bool spvSubgroupShuffleXor(bool value, ushort mask)
{
    const ushort widened = ushort(value);
    return quad_shuffle_xor(widened, mask) != 0;
}

// bool-vector overload: round-trips through a ushort vector of the same
// width around the quad_shuffle_xor() call.
template<uint N>
inline vec<bool, N> spvSubgroupShuffleXor(vec<bool, N> value, ushort mask)
{
    using Wide = vec<ushort, N>;
    using Narrow = vec<bool, N>;
    const Wide widened = Wide(value);
    return Narrow(quad_shuffle_xor(widened, mask));
}

// Generic shuffle-up helper: forwards `value` and `delta` unchanged to
// Metal's quad_shuffle_up() and returns its result.
template<typename T>
inline T spvSubgroupShuffleUp(T value, ushort delta)
{
    const T fetched = quad_shuffle_up(value, delta);
    return fetched;
}

// bool specialization: widen to ushort for quad_shuffle_up(), then map a
// non-zero result back to true.
template<>
inline bool spvSubgroupShuffleUp(bool value, ushort delta)
{
    const ushort widened = ushort(value);
    return quad_shuffle_up(widened, delta) != 0;
}

// bool-vector overload: round-trips through a ushort vector of the same
// width around the quad_shuffle_up() call.
template<uint N>
inline vec<bool, N> spvSubgroupShuffleUp(vec<bool, N> value, ushort delta)
{
    using Wide = vec<ushort, N>;
    using Narrow = vec<bool, N>;
    const Wide widened = Wide(value);
    return Narrow(quad_shuffle_up(widened, delta));
}

// Generic shuffle-down helper: forwards `value` and `delta` unchanged to
// Metal's quad_shuffle_down() and returns its result.
template<typename T>
inline T spvSubgroupShuffleDown(T value, ushort delta)
{
    const T fetched = quad_shuffle_down(value, delta);
    return fetched;
}

// bool specialization: widen to ushort for quad_shuffle_down(), then map a
// non-zero result back to true.
template<>
inline bool spvSubgroupShuffleDown(bool value, ushort delta)
{
    const ushort widened = ushort(value);
    return quad_shuffle_down(widened, delta) != 0;
}

// bool-vector overload: round-trips through a ushort vector of the same
// width around the quad_shuffle_down() call.
template<uint N>
inline vec<bool, N> spvSubgroupShuffleDown(vec<bool, N> value, ushort delta)
{
    using Wide = vec<ushort, N>;
    using Narrow = vec<bool, N>;
    const Wide widened = Wide(value);
    return Narrow(quad_shuffle_down(widened, delta));
}

// Generic quad broadcast helper: forwards `value` and the source `lane`
// unchanged to Metal's quad_broadcast() and returns its result.
template<typename T>
inline T spvQuadBroadcast(T value, uint lane)
{
    const T fetched = quad_broadcast(value, lane);
    return fetched;
}

// bool specialization: widen to ushort for quad_broadcast(), then map a
// non-zero result back to true.
template<>
inline bool spvQuadBroadcast(bool value, uint lane)
{
    const ushort widened = ushort(value);
    return quad_broadcast(widened, lane) != 0;
}

// bool-vector overload: round-trips through a ushort vector of the same
// width around the quad_broadcast() call.
template<uint N>
inline vec<bool, N> spvQuadBroadcast(vec<bool, N> value, uint lane)
{
    using Wide = vec<ushort, N>;
    using Narrow = vec<bool, N>;
    const Wide widened = Wide(value);
    return Narrow(quad_broadcast(widened, lane));
}

// Generic quad swap helper: implemented as an XOR shuffle whose mask is
// `dir + 1`, so directions 0, 1 and 2 become masks 1, 2 and 3.
template<typename T>
inline T spvQuadSwap(T value, uint dir)
{
    const uint xor_mask = dir + 1;
    return quad_shuffle_xor(value, xor_mask);
}

// bool specialization: widen to ushort, XOR-shuffle with mask `dir + 1`,
// then map a non-zero result back to true.
template<>
inline bool spvQuadSwap(bool value, uint dir)
{
    const ushort widened = ushort(value);
    const uint xor_mask = dir + 1;
    return quad_shuffle_xor(widened, xor_mask) != 0;
}

// bool-vector overload: round-trips through a ushort vector of the same
// width around an XOR shuffle with mask `dir + 1`.
template<uint N>
inline vec<bool, N> spvQuadSwap(vec<bool, N> value, uint dir)
{
    using Wide = vec<ushort, N>;
    using Narrow = vec<bool, N>;
    const Wide widened = Wide(value);
    const uint xor_mask = dir + 1;
    return Narrow(quad_shuffle_xor(widened, xor_mask));
}

// Layout of the device buffer that main0 receives at [[buffer(0)]].
struct SSBO
{
    // The only field; main0 stores each subgroup built-in into it in turn.
    float FragColor;
};

// Work-group size constant, initialised from uint3(1u). Tagged [[maybe_unused]];
// main0 in this file never reads it.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u);

// Compute entry point exercising the quad-scope subgroup helpers.
//   _9                      - device buffer at [[buffer(0)]] receiving the built-in values
//   gl_NumSubgroups         - [[quadgroups_per_threadgroup]]
//   gl_SubgroupID           - [[quadgroup_index_in_threadgroup]]
//   gl_SubgroupSize         - [[thread_execution_width]]
//   gl_SubgroupInvocationID - [[thread_index_in_quadgroup]]
// The shuffle/swap/broadcast results are kept in locals only; nothing reads them afterwards.
kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[quadgroups_per_threadgroup]], uint gl_SubgroupID [[quadgroup_index_in_threadgroup]], uint gl_SubgroupSize [[thread_execution_width]], uint gl_SubgroupInvocationID [[thread_index_in_quadgroup]])
{
    // Each store overwrites the previous one, so FragColor ends up holding the invocation ID.
    _9.FragColor = static_cast<float>(gl_NumSubgroups);
    _9.FragColor = static_cast<float>(gl_SubgroupID);
    _9.FragColor = static_cast<float>(gl_SubgroupSize);
    _9.FragColor = static_cast<float>(gl_SubgroupInvocationID);

    // Two barriers over all three memory scopes, then one barrier per individual scope.
    const auto every_scope = mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture;
    simdgroup_barrier(every_scope);
    simdgroup_barrier(every_scope);
    simdgroup_barrier(mem_flags::mem_device);
    simdgroup_barrier(mem_flags::mem_threadgroup);
    simdgroup_barrier(mem_flags::mem_texture);

    // Scalar shuffles, each tried with a uint and a bool payload.
    // NOTE(review): lane/mask values 8 and 9 look out of range for a 4-wide quad;
    // presumably intentional given the "invalid" tag in the file name - confirm.
    uint shuffled = spvSubgroupShuffle(10u, 8u);
    bool shuffled_bool = spvSubgroupShuffle(true, 9u);
    uint shuffled_xor = spvSubgroupShuffleXor(30u, 8u);
    bool shuffled_xor_bool = spvSubgroupShuffleXor(false, 9u);
    uint shuffled_up = spvSubgroupShuffleUp(20u, 4u);
    bool shuffled_up_bool = spvSubgroupShuffleUp(true, 4u);
    uint shuffled_down = spvSubgroupShuffleDown(20u, 4u);
    bool shuffled_down_bool = spvSubgroupShuffleDown(false, 4u);

    // Vector payloads shared by the quad swap and broadcast calls below.
    const float4 twenty = float4(20.0);
    const bool4 all_true = bool4(true);

    // Quad swaps with direction 0, 1 and 2 (named horizontal, vertical, diagonal here).
    float4 swap_horiz = spvQuadSwap(twenty, 0u);
    bool4 swap_horiz_bool = spvQuadSwap(all_true, 0u);
    float4 swap_vertical = spvQuadSwap(twenty, 1u);
    bool4 swap_vertical_bool = spvQuadSwap(all_true, 1u);
    float4 swap_diagonal = spvQuadSwap(twenty, 2u);
    bool4 swap_diagonal_bool = spvQuadSwap(all_true, 2u);

    // Broadcast from quad lane 3.
    float4 quad_broadcast0 = spvQuadBroadcast(twenty, 3u);
    bool4 quad_broadcast_bool = spvQuadBroadcast(all_true, 3u);
}
MSL: Cast broadcast booleans to ushort. Metal doesn't support broadcasting or shuffling boolean values, but we can work around that by casting it to `ushort`, then casting it back to `bool`. I used `ushort` instead of `uint` because 16-bit values give better throughput on Apple GPUs. 2020-10-21 06:51:48 +00:00			`#pragma clang diagnostic ignored "-Wmissing-prototypes"`

MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00			`#include <metal_stdlib>`
			`#include <simd/simd.h>`

			`using namespace metal;`

MSL: Cast broadcast booleans to ushort. Metal doesn't support broadcasting or shuffling boolean values, but we can work around that by casting it to `ushort`, then casting it back to `bool`. I used `ushort` instead of `uint` because 16-bit values give better throughput on Apple GPUs. 2020-10-21 06:51:48 +00:00			`template<typename T>`
			`inline T spvSubgroupShuffle(T value, ushort lane)`
			`{`
			`return quad_shuffle(value, lane);`
			`}`

			`template<>`
			`inline bool spvSubgroupShuffle(bool value, ushort lane)`
			`{`
			`return !!quad_shuffle((ushort)value, lane);`
			`}`

			`template<uint N>`
			`inline vec<bool, N> spvSubgroupShuffle(vec<bool, N> value, ushort lane)`
			`{`
			`return (vec<bool, N>)quad_shuffle((vec<ushort, N>)value, lane);`
			`}`

			`template<typename T>`
			`inline T spvSubgroupShuffleXor(T value, ushort mask)`
			`{`
			`return quad_shuffle_xor(value, mask);`
			`}`

			`template<>`
			`inline bool spvSubgroupShuffleXor(bool value, ushort mask)`
			`{`
			`return !!quad_shuffle_xor((ushort)value, mask);`
			`}`

			`template<uint N>`
			`inline vec<bool, N> spvSubgroupShuffleXor(vec<bool, N> value, ushort mask)`
			`{`
			`return (vec<bool, N>)quad_shuffle_xor((vec<ushort, N>)value, mask);`
			`}`

			`template<typename T>`
			`inline T spvSubgroupShuffleUp(T value, ushort delta)`
			`{`
			`return quad_shuffle_up(value, delta);`
			`}`

			`template<>`
			`inline bool spvSubgroupShuffleUp(bool value, ushort delta)`
			`{`
			`return !!quad_shuffle_up((ushort)value, delta);`
			`}`

			`template<uint N>`
			`inline vec<bool, N> spvSubgroupShuffleUp(vec<bool, N> value, ushort delta)`
			`{`
			`return (vec<bool, N>)quad_shuffle_up((vec<ushort, N>)value, delta);`
			`}`

			`template<typename T>`
			`inline T spvSubgroupShuffleDown(T value, ushort delta)`
			`{`
			`return quad_shuffle_down(value, delta);`
			`}`

			`template<>`
			`inline bool spvSubgroupShuffleDown(bool value, ushort delta)`
			`{`
			`return !!quad_shuffle_down((ushort)value, delta);`
			`}`

			`template<uint N>`
			`inline vec<bool, N> spvSubgroupShuffleDown(vec<bool, N> value, ushort delta)`
			`{`
			`return (vec<bool, N>)quad_shuffle_down((vec<ushort, N>)value, delta);`
			`}`

			`template<typename T>`
			`inline T spvQuadBroadcast(T value, uint lane)`
			`{`
			`return quad_broadcast(value, lane);`
			`}`

			`template<>`
			`inline bool spvQuadBroadcast(bool value, uint lane)`
			`{`
			`return !!quad_broadcast((ushort)value, lane);`
			`}`

			`template<uint N>`
			`inline vec<bool, N> spvQuadBroadcast(vec<bool, N> value, uint lane)`
			`{`
			`return (vec<bool, N>)quad_broadcast((vec<ushort, N>)value, lane);`
			`}`

			`template<typename T>`
			`inline T spvQuadSwap(T value, uint dir)`
			`{`
			`return quad_shuffle_xor(value, dir + 1);`
			`}`

			`template<>`
			`inline bool spvQuadSwap(bool value, uint dir)`
			`{`
			`return !!quad_shuffle_xor((ushort)value, dir + 1);`
			`}`

			`template<uint N>`
			`inline vec<bool, N> spvQuadSwap(vec<bool, N> value, uint dir)`
			`{`
			`return (vec<bool, N>)quad_shuffle_xor((vec<ushort, N>)value, dir + 1);`
			`}`

MSL: Support synthetic functions in function constants. Emit synthetic functions before function constants. Support use of spvQuantizeToF16() in function constants for numerical behavior consistency with the op code. Ensure subnormal results from OpQuantizeToF16 are flushed to zero per SPIR-V spec. Adjust SPIRV-Cross unit test reference shaders to accommodate these changes. Any MSL reference shader that includes a synthetic function is affected, since the location it is emitted has changed. 2021-09-28 23:10:16 +00:00			`struct SSBO`
			`{`
			`float FragColor;`
			`};`

			`constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u);`

MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00			`kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[quadgroups_per_threadgroup]], uint gl_SubgroupID [[quadgroup_index_in_threadgroup]], uint gl_SubgroupSize [[thread_execution_width]], uint gl_SubgroupInvocationID [[thread_index_in_quadgroup]])`
			`{`
			`_9.FragColor = float(gl_NumSubgroups);`
			`_9.FragColor = float(gl_SubgroupID);`
			`_9.FragColor = float(gl_SubgroupSize);`
			`_9.FragColor = float(gl_SubgroupInvocationID);`
			`simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);`
			`simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture);`
			`simdgroup_barrier(mem_flags::mem_device);`
			`simdgroup_barrier(mem_flags::mem_threadgroup);`
			`simdgroup_barrier(mem_flags::mem_texture);`
MSL: Cast broadcast booleans to ushort. Metal doesn't support broadcasting or shuffling boolean values, but we can work around that by casting it to `ushort`, then casting it back to `bool`. I used `ushort` instead of `uint` because 16-bit values give better throughput on Apple GPUs. 2020-10-21 06:51:48 +00:00			`uint shuffled = spvSubgroupShuffle(10u, 8u);`
			`bool shuffled_bool = spvSubgroupShuffle(true, 9u);`
			`uint shuffled_xor = spvSubgroupShuffleXor(30u, 8u);`
			`bool shuffled_xor_bool = spvSubgroupShuffleXor(false, 9u);`
			`uint shuffled_up = spvSubgroupShuffleUp(20u, 4u);`
			`bool shuffled_up_bool = spvSubgroupShuffleUp(true, 4u);`
			`uint shuffled_down = spvSubgroupShuffleDown(20u, 4u);`
			`bool shuffled_down_bool = spvSubgroupShuffleDown(false, 4u);`
			`float4 swap_horiz = spvQuadSwap(float4(20.0), 0u);`
			`bool4 swap_horiz_bool = spvQuadSwap(bool4(true), 0u);`
			`float4 swap_vertical = spvQuadSwap(float4(20.0), 1u);`
			`bool4 swap_vertical_bool = spvQuadSwap(bool4(true), 1u);`
			`float4 swap_diagonal = spvQuadSwap(float4(20.0), 2u);`
			`bool4 swap_diagonal_bool = spvQuadSwap(bool4(true), 2u);`
			`float4 quad_broadcast0 = spvQuadBroadcast(float4(20.0), 3u);`
			`bool4 quad_broadcast_bool = spvQuadBroadcast(bool4(true), 3u);`
MSL: Add support for subgroup operations. Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that is will falsely believe itself not to be. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor; as well as cluster sizes other than 4 are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations. 2019-05-15 21:03:30 +00:00			`}`