9d9415754b
Some support for subgroups is present starting in Metal 2.0 on both iOS and macOS. macOS gains more complete support in 10.14 (Metal 2.1). Some restrictions are present. On iOS and on macOS 10.13, the implementation of `OpGroupNonUniformElect` is incorrect: if thread 0 has already terminated or is not executing a conditional branch, the first thread that *is* executing will falsely believe that it is not the first active thread. Unfortunately, this operation is part of the "basic" feature set; without it, subgroups cannot be supported at all. The `SubgroupSize` and `SubgroupLocalInvocationId` builtins are only available in compute shaders (and, by extension, tessellation control shaders), despite SPIR-V making them available in all stages. This limits the usefulness of some of the subgroup operations in fragment shaders. Although Metal on macOS supports some clustered, inclusive, and exclusive operations, it does not support them all. In particular, inclusive and exclusive min, max, and, or, and xor, as well as cluster sizes other than 4, are not supported. If this becomes a problem, they could be emulated, but at a significant performance cost due to the need for non-uniform operations.
80 lines
1.5 KiB
Plaintext
80 lines
1.5 KiB
Plaintext
#pragma clang diagnostic ignored "-Wmissing-prototypes"
|
|
|
|
#include <metal_stdlib>
|
|
#include <simd/simd.h>
|
|
|
|
using namespace metal;
|
|
|
|
// Program-scope constant holding uint3(4, 1, 1). The name suggests it records the
// compute workgroup (threadgroup) size -- presumably generated from the original
// shader's local-size declaration; TODO confirm. None of the visible code reads it,
// which is consistent with the [[maybe_unused]] attribute.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 1u, 1u);
|
|
|
|
// Issues a threadgroup_barrier() whose memory scope is limited to the
// mem_threadgroup flag.
void barrier_shared()
{
    const auto fence_scope = mem_flags::mem_threadgroup;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a threadgroup_barrier() with all three memory flags used in this file
// combined: mem_device, mem_threadgroup and mem_texture.
void full_barrier()
{
    const auto fence_scope =
        mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a threadgroup_barrier() whose memory scope is limited to the
// mem_texture flag.
void image_barrier()
{
    const auto fence_scope = mem_flags::mem_texture;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a threadgroup_barrier() whose memory scope is limited to the
// mem_device flag.
void buffer_barrier()
{
    const auto fence_scope = mem_flags::mem_device;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a threadgroup_barrier() with mem_device, mem_threadgroup and
// mem_texture OR-ed together (the same flag set that full_barrier() passes).
void group_barrier()
{
    const auto fence_scope =
        mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a single threadgroup_barrier() with the mem_threadgroup flag.
// The emitted call is the same one barrier_shared() makes.
void barrier_shared_exec()
{
    const auto fence_scope = mem_flags::mem_threadgroup;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a single threadgroup_barrier() with mem_device, mem_threadgroup and
// mem_texture combined. The emitted call is the same one full_barrier() makes.
void full_barrier_exec()
{
    const auto fence_scope =
        mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a single threadgroup_barrier() with the mem_texture flag.
// The emitted call is the same one image_barrier() makes.
void image_barrier_exec()
{
    const auto fence_scope = mem_flags::mem_texture;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a single threadgroup_barrier() with the mem_device flag.
// The emitted call is the same one buffer_barrier() makes.
void buffer_barrier_exec()
{
    const auto fence_scope = mem_flags::mem_device;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a single threadgroup_barrier() with mem_device, mem_threadgroup and
// mem_texture combined. The emitted call is the same one group_barrier() makes.
void group_barrier_exec()
{
    const auto fence_scope =
        mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Issues a threadgroup_barrier() passing only the mem_threadgroup flag.
void exec_barrier()
{
    const auto fence_scope = mem_flags::mem_threadgroup;
    threadgroup_barrier(fence_scope);
}
|
|
|
|
// Kernel entry point. Takes no arguments and simply invokes every barrier
// helper defined above exactly once, in a fixed order.
kernel void main0()
{
    // Helpers without the "_exec" suffix.
    barrier_shared();
    full_barrier();
    image_barrier();
    buffer_barrier();
    group_barrier();

    // "_exec"-suffixed counterparts, in the same relative order.
    barrier_shared_exec();
    full_barrier_exec();
    image_barrier_exec();
    buffer_barrier_exec();
    group_barrier_exec();

    // Final stand-alone barrier helper.
    exec_barrier();
}
|
|
|