#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

// Declared workgroup dimensions (8 x 1 x 1). Nothing in this file reads the
// constant, hence the [[maybe_unused]] attribute.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 1u, 1u);

// ---------------------------------------------------------------------------
// Array copy helpers.
//
// Each helper copies the A elements of `src` into `dst` one element at a
// time. The variants differ only in the address-space qualifiers of the two
// array references (constant / thread / threadgroup); a separate overload is
// needed per combination because the address space is part of the reference
// type.
//
//   T   - element type of both arrays.
//   A   - number of elements in both arrays (deduced from the array bound).
//   dst - destination array, written in full.
//   src - source array, read in full.
// ---------------------------------------------------------------------------

// Copies a `constant` array into a `thread` array.
template<typename T, uint A>
void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
{
    for (uint i = 0; i < A; i++)
    {
        dst[i] = src[i];
    }
}

// Copies a `constant` array into a `threadgroup` array.
template<typename T, uint A>
void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
{
    for (uint i = 0; i < A; i++)
    {
        dst[i] = src[i];
    }
}

// Copies a `thread` array into another `thread` array.
template<typename T, uint A>
void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
{
    for (uint i = 0; i < A; i++)
    {
        dst[i] = src[i];
    }
}

// Copies a `thread` array into a `threadgroup` array.
template<typename T, uint A>
void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
{
    for (uint i = 0; i < A; i++)
    {
        dst[i] = src[i];
    }
}

// Copies a `threadgroup` array into a `thread` array.
template<typename T, uint A>
void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
{
    for (uint i = 0; i < A; i++)
    {
        dst[i] = src[i];
    }
}

// Copies a `threadgroup` array into another `threadgroup` array.
template<typename T, uint A>
void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
{
    for (uint i = 0; i < A; i++)
    {
        dst[i] = src[i];
    }
}

// Compute kernel exercising array copies between thread-local and threadgroup
// storage.
//
//   gl_LocalInvocationIndex - this thread's index within its threadgroup; it
//                             selects the row of the 8x8 threadgroup arrays
//                             this thread works on.
//
// NOTE(review): the row index is not bounds-checked, so this presumably
// relies on the kernel being dispatched with 8 threads per threadgroup (as
// gl_WorkGroupSize declares) - confirm against the host dispatch code.
kernel void main0(uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
{
    threadgroup float shared_group[8][8];
    threadgroup float shared_group_alt[8][8];

    // Fill a thread-local array with 0.0 .. 7.0.
    float blob[8];
    for (int i = 0; i < 8; i++)
    {
        blob[i] = float(i);
    }

    // Publish this thread's array into its own row of threadgroup memory.
    spvArrayCopyFromStackToThreadGroup1(shared_group[gl_LocalInvocationIndex], blob);

    // The next read targets a row written by a different thread, so wait for
    // all threadgroup-memory writes before continuing.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Read back the row at index (own index XOR 1) into thread-local storage.
    // copied_blob is not used afterwards in this kernel.
    float copied_blob[8];
    spvArrayCopyFromThreadGroupToStack1(copied_blob, shared_group[gl_LocalInvocationIndex ^ 1u]);

    // Duplicate this thread's own row into the second threadgroup array.
    spvArrayCopyFromThreadGroupToThreadGroup1(shared_group_alt[gl_LocalInvocationIndex], shared_group[gl_LocalInvocationIndex]);
}