diff --git a/CMakeLists.txt b/CMakeLists.txt index d8a0c77a..34515656 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -323,7 +323,7 @@ if (SPIRV_CROSS_STATIC) endif() set(spirv-cross-abi-major 0) -set(spirv-cross-abi-minor 42) +set(spirv-cross-abi-minor 43) set(spirv-cross-abi-patch 0) if (SPIRV_CROSS_SHARED) diff --git a/main.cpp b/main.cpp index da247768..f9ee2e0d 100644 --- a/main.cpp +++ b/main.cpp @@ -566,6 +566,9 @@ struct CLIArguments uint32_t msl_r32ui_linear_texture_alignment = 4; uint32_t msl_r32ui_alignment_constant_id = 65535; bool msl_texture_1d_as_2d = false; + bool msl_ios_use_simdgroup_functions = false; + bool msl_emulate_subgroups = false; + uint32_t msl_fixed_subgroup_size = 0; bool glsl_emit_push_constant_as_ubo = false; bool glsl_emit_ubo_as_plain_uniforms = false; bool glsl_force_flattened_io_blocks = false; @@ -779,7 +782,14 @@ static void print_help_msl() "\t[--msl-r32ui-linear-texture-align-constant-id ]:\n\t\tThe function constant ID to use for the linear texture alignment.\n" "\t\tOn MSL 1.2 or later, you can override the alignment by setting this function constant.\n" "\t[--msl-texture-1d-as-2d]:\n\t\tEmit Image variables of dimension Dim1D as texture2d.\n" - "\t\tIn Metal, 1D textures do not support all features that 2D textures do. Use this option if your code relies on these features.\n"); + "\t\tIn Metal, 1D textures do not support all features that 2D textures do. Use this option if your code relies on these features.\n" + "\t[--msl-ios-use-simdgroup-functions]:\n\t\tUse simd_*() functions for subgroup ops instead of quad_*().\n" + "\t\tRecent Apple GPUs support SIMD-groups larger than a quad. Use this option to take advantage of this support.\n" + "\t[--msl-emulate-subgroups]:\n\t\tAssume subgroups of size 1.\n" + "\t\tIntended for Vulkan Portability implementations where Metal support for SIMD-groups is insufficient for true subgroups.\n" + "\t[--msl-fixed-subgroup-size ]:\n\t\tAssign a constant to the SubgroupSize builtin.\n" + "\t\tIntended for Vulkan Portability implementations where VK_EXT_subgroup_size_control is not supported or disabled.\n" + "\t\tIf 0, assume variable subgroup size as actually exposed by Metal.\n"); // clang-format on } @@ -1021,6 +1031,9 @@ static string compile_iteration(const CLIArguments &args, std::vector msl_opts.r32ui_linear_texture_alignment = args.msl_r32ui_linear_texture_alignment; msl_opts.r32ui_alignment_constant_id = args.msl_r32ui_alignment_constant_id; msl_opts.texture_1D_as_2D = args.msl_texture_1d_as_2d; + msl_opts.ios_use_simdgroup_functions = args.msl_ios_use_simdgroup_functions; + msl_opts.emulate_subgroups = args.msl_emulate_subgroups; + msl_opts.fixed_subgroup_size = args.msl_fixed_subgroup_size; msl_comp->set_msl_options(msl_opts); for (auto &v : args.msl_discrete_descriptor_sets) msl_comp->add_discrete_descriptor_set(v); @@ -1449,6 +1462,10 @@ static int main_inner(int argc, char *argv[]) cbs.add("--msl-r32ui-linear-texture-align-constant-id", [&args](CLIParser &parser) { args.msl_r32ui_alignment_constant_id = parser.next_uint(); }); cbs.add("--msl-texture-1d-as-2d", [&args](CLIParser &) { args.msl_texture_1d_as_2d = true; }); + cbs.add("--msl-ios-use-simdgroup-functions", [&args](CLIParser &) { args.msl_ios_use_simdgroup_functions = true; }); + cbs.add("--msl-emulate-subgroups", [&args](CLIParser &) { args.msl_emulate_subgroups = true; }); + cbs.add("--msl-fixed-subgroup-size", + [&args](CLIParser &parser) { args.msl_fixed_subgroup_size = parser.next_uint(); }); cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); }); cbs.add("--rename-entry-point", [&args](CLIParser &parser) { auto old_name = parser.next_string(); diff --git a/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl12.emulate-subgroup.comp b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl12.emulate-subgroup.comp new file mode 100644 index 00000000..651991e3 --- /dev/null +++ b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl12.emulate-subgroup.comp @@ -0,0 +1,30 @@ +#include +#include + +using namespace metal; + +struct SSBO +{ + float FragColor; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], unsupported-built-in-type gl_WorkGroupSize [[unsupported-built-in]]) +{ + uint gl_NumSubgroups = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; + uint gl_SubgroupID = gl_LocalInvocationIndex; + uint gl_SubgroupSize = 1; + uint gl_SubgroupInvocationID = 0; + _9.FragColor = float(gl_NumSubgroups); + _9.FragColor = float(gl_SubgroupID); + _9.FragColor = float(gl_SubgroupSize); + _9.FragColor = float(gl_SubgroupInvocationID); + threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + threadgroup_barrier(mem_flags::mem_device); + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_texture); + bool elected = true; +} + diff --git a/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.comp b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.comp index 6ad48b48..5c98435d 100644 --- a/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.comp +++ b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.comp @@ -242,7 +242,8 @@ kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[simdgrou simdgroup_barrier(mem_flags::mem_device); simdgroup_barrier(mem_flags::mem_threadgroup); simdgroup_barrier(mem_flags::mem_texture); - bool elected = simd_is_first(); + bool _39 = simd_is_first(); + bool elected = _39; _9.FragColor = float4(gl_SubgroupEqMask).x; _9.FragColor = float4(gl_SubgroupGeMask).x; _9.FragColor = float4(gl_SubgroupGtMask).x; diff --git a/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.fixed-subgroup.comp b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.fixed-subgroup.comp new file mode 100644 index 00000000..809a8b56 --- /dev/null +++ b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.fixed-subgroup.comp @@ -0,0 +1,322 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct SSBO +{ + float FragColor; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +template +inline T spvSubgroupBroadcast(T value, ushort lane) +{ + return simd_broadcast(value, lane); +} + +template<> +inline bool spvSubgroupBroadcast(bool value, ushort lane) +{ + return !!simd_broadcast((ushort)value, lane); +} + +template +inline vec spvSubgroupBroadcast(vec value, ushort lane) +{ + return (vec)simd_broadcast((vec)value, lane); +} + +template +inline T spvSubgroupBroadcastFirst(T value) +{ + return simd_broadcast_first(value); +} + +template<> +inline bool spvSubgroupBroadcastFirst(bool value) +{ + return !!simd_broadcast_first((ushort)value); +} + +template +inline vec spvSubgroupBroadcastFirst(vec value) +{ + return (vec)simd_broadcast_first((vec)value); +} + +inline uint4 spvSubgroupBallot(bool value) +{ + simd_vote vote = simd_ballot(value); + // simd_ballot() returns a 64-bit integer-like object, but + // SPIR-V callers expect a uint4. We must convert. + // FIXME: This won't include higher bits if Apple ever supports + // 128 lanes in an SIMD-group. + return uint4((uint)((simd_vote::vote_t)vote & 0xFFFFFFFF), (uint)(((simd_vote::vote_t)vote >> 32) & 0xFFFFFFFF), 0, 0); +} + +inline bool spvSubgroupBallotBitExtract(uint4 ballot, uint bit) +{ + return !!extract_bits(ballot[bit / 32], bit % 32, 1); +} + +inline uint spvSubgroupBallotFindLSB(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0)); + ballot &= mask; + return select(ctz(ballot.x), select(32 + ctz(ballot.y), select(64 + ctz(ballot.z), select(96 + ctz(ballot.w), uint(-1), ballot.w == 0), ballot.z == 0), ballot.y == 0), ballot.x == 0); +} + +inline uint spvSubgroupBallotFindMSB(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0)); + ballot &= mask; + return select(128 - (clz(ballot.w) + 1), select(96 - (clz(ballot.z) + 1), select(64 - (clz(ballot.y) + 1), select(32 - (clz(ballot.x) + 1), uint(-1), ballot.x == 0), ballot.y == 0), ballot.z == 0), ballot.w == 0); +} + +inline uint spvPopCount4(uint4 ballot) +{ + return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w); +} + +inline uint spvSubgroupBallotBitCount(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0)); + return spvPopCount4(ballot & mask); +} + +inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), uint2(0)); + return spvPopCount4(ballot & mask); +} + +inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0)); + return spvPopCount4(ballot & mask); +} + +template +inline bool spvSubgroupAllEqual(T value) +{ + return simd_all(all(value == simd_broadcast_first(value))); +} + +template<> +inline bool spvSubgroupAllEqual(bool value) +{ + return simd_all(value) || !simd_any(value); +} + +template +inline bool spvSubgroupAllEqual(vec value) +{ + return simd_all(all(value == (vec)simd_broadcast_first((vec)value))); +} + +template +inline T spvSubgroupShuffle(T value, ushort lane) +{ + return simd_shuffle(value, lane); +} + +template<> +inline bool spvSubgroupShuffle(bool value, ushort lane) +{ + return !!simd_shuffle((ushort)value, lane); +} + +template +inline vec spvSubgroupShuffle(vec value, ushort lane) +{ + return (vec)simd_shuffle((vec)value, lane); +} + +template +inline T spvSubgroupShuffleXor(T value, ushort mask) +{ + return simd_shuffle_xor(value, mask); +} + +template<> +inline bool spvSubgroupShuffleXor(bool value, ushort mask) +{ + return !!simd_shuffle_xor((ushort)value, mask); +} + +template +inline vec spvSubgroupShuffleXor(vec value, ushort mask) +{ + return (vec)simd_shuffle_xor((vec)value, mask); +} + +template +inline T spvSubgroupShuffleUp(T value, ushort delta) +{ + return simd_shuffle_up(value, delta); +} + +template<> +inline bool spvSubgroupShuffleUp(bool value, ushort delta) +{ + return !!simd_shuffle_up((ushort)value, delta); +} + +template +inline vec spvSubgroupShuffleUp(vec value, ushort delta) +{ + return (vec)simd_shuffle_up((vec)value, delta); +} + +template +inline T spvSubgroupShuffleDown(T value, ushort delta) +{ + return simd_shuffle_down(value, delta); +} + +template<> +inline bool spvSubgroupShuffleDown(bool value, ushort delta) +{ + return !!simd_shuffle_down((ushort)value, delta); +} + +template +inline vec spvSubgroupShuffleDown(vec value, ushort delta) +{ + return (vec)simd_shuffle_down((vec)value, delta); +} + +template +inline T spvQuadBroadcast(T value, uint lane) +{ + return quad_broadcast(value, lane); +} + +template<> +inline bool spvQuadBroadcast(bool value, uint lane) +{ + return !!quad_broadcast((ushort)value, lane); +} + +template +inline vec spvQuadBroadcast(vec value, uint lane) +{ + return (vec)quad_broadcast((vec)value, lane); +} + +template +inline T spvQuadSwap(T value, uint dir) +{ + return quad_shuffle_xor(value, dir + 1); +} + +template<> +inline bool spvQuadSwap(bool value, uint dir) +{ + return !!quad_shuffle_xor((ushort)value, dir + 1); +} + +template +inline vec spvQuadSwap(vec value, uint dir) +{ + return (vec)quad_shuffle_xor((vec)value, dir + 1); +} + +kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[simdgroups_per_threadgroup]], uint gl_SubgroupID [[simdgroup_index_in_threadgroup]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]]) +{ + uint gl_SubgroupSize = 32; + uint4 gl_SubgroupEqMask = gl_SubgroupInvocationID >= 32 ? uint4(0, (1 << (gl_SubgroupInvocationID - 32)), uint2(0)) : uint4(1 << gl_SubgroupInvocationID, uint3(0)); + uint4 gl_SubgroupGeMask = uint4(insert_bits(0u, 0xFFFFFFFF, gl_SubgroupInvocationID, 32 - gl_SubgroupInvocationID), uint3(0)) & spvSubgroupBallot(true); + uint4 gl_SubgroupGtMask = uint4(insert_bits(0u, 0xFFFFFFFF, gl_SubgroupInvocationID + 1, 32 - gl_SubgroupInvocationID - 1), uint3(0)) & spvSubgroupBallot(true); + uint4 gl_SubgroupLeMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), uint2(0)); + uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0)); + _9.FragColor = float(gl_NumSubgroups); + _9.FragColor = float(gl_SubgroupID); + _9.FragColor = float(gl_SubgroupSize); + _9.FragColor = float(gl_SubgroupInvocationID); + simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + simdgroup_barrier(mem_flags::mem_device); + simdgroup_barrier(mem_flags::mem_threadgroup); + simdgroup_barrier(mem_flags::mem_texture); + bool _39 = simd_is_first(); + bool elected = _39; + _9.FragColor = float4(gl_SubgroupEqMask).x; + _9.FragColor = float4(gl_SubgroupGeMask).x; + _9.FragColor = float4(gl_SubgroupGtMask).x; + _9.FragColor = float4(gl_SubgroupLeMask).x; + _9.FragColor = float4(gl_SubgroupLtMask).x; + float4 broadcasted = spvSubgroupBroadcast(float4(10.0), 8u); + bool2 broadcasted_bool = spvSubgroupBroadcast(bool2(true), 8u); + float3 first = spvSubgroupBroadcastFirst(float3(20.0)); + bool4 first_bool = spvSubgroupBroadcastFirst(bool4(false)); + uint4 ballot_value = spvSubgroupBallot(true); + bool inverse_ballot_value = spvSubgroupBallotBitExtract(ballot_value, gl_SubgroupInvocationID); + bool bit_extracted = spvSubgroupBallotBitExtract(uint4(10u), 8u); + uint bit_count = spvSubgroupBallotBitCount(ballot_value, gl_SubgroupSize); + uint inclusive_bit_count = spvSubgroupBallotInclusiveBitCount(ballot_value, gl_SubgroupInvocationID); + uint exclusive_bit_count = spvSubgroupBallotExclusiveBitCount(ballot_value, gl_SubgroupInvocationID); + uint lsb = spvSubgroupBallotFindLSB(ballot_value, gl_SubgroupSize); + uint msb = spvSubgroupBallotFindMSB(ballot_value, gl_SubgroupSize); + uint shuffled = spvSubgroupShuffle(10u, 8u); + bool shuffled_bool = spvSubgroupShuffle(true, 9u); + uint shuffled_xor = spvSubgroupShuffleXor(30u, 8u); + bool shuffled_xor_bool = spvSubgroupShuffleXor(false, 9u); + uint shuffled_up = spvSubgroupShuffleUp(20u, 4u); + bool shuffled_up_bool = spvSubgroupShuffleUp(true, 4u); + uint shuffled_down = spvSubgroupShuffleDown(20u, 4u); + bool shuffled_down_bool = spvSubgroupShuffleDown(false, 4u); + bool has_all = simd_all(true); + bool has_any = simd_any(true); + bool has_equal = spvSubgroupAllEqual(0); + has_equal = spvSubgroupAllEqual(true); + has_equal = spvSubgroupAllEqual(float3(0.0, 1.0, 2.0)); + has_equal = spvSubgroupAllEqual(bool4(true, true, false, true)); + float4 added = simd_sum(float4(20.0)); + int4 iadded = simd_sum(int4(20)); + float4 multiplied = simd_product(float4(20.0)); + int4 imultiplied = simd_product(int4(20)); + float4 lo = simd_min(float4(20.0)); + float4 hi = simd_max(float4(20.0)); + int4 slo = simd_min(int4(20)); + int4 shi = simd_max(int4(20)); + uint4 ulo = simd_min(uint4(20u)); + uint4 uhi = simd_max(uint4(20u)); + uint4 anded = simd_and(ballot_value); + uint4 ored = simd_or(ballot_value); + uint4 xored = simd_xor(ballot_value); + added = simd_prefix_inclusive_sum(added); + iadded = simd_prefix_inclusive_sum(iadded); + multiplied = simd_prefix_inclusive_product(multiplied); + imultiplied = simd_prefix_inclusive_product(imultiplied); + added = simd_prefix_exclusive_sum(multiplied); + multiplied = simd_prefix_exclusive_product(multiplied); + iadded = simd_prefix_exclusive_sum(imultiplied); + imultiplied = simd_prefix_exclusive_product(imultiplied); + added = quad_sum(added); + multiplied = quad_product(multiplied); + iadded = quad_sum(iadded); + imultiplied = quad_product(imultiplied); + lo = quad_min(lo); + hi = quad_max(hi); + ulo = quad_min(ulo); + uhi = quad_max(uhi); + slo = quad_min(slo); + shi = quad_max(shi); + anded = quad_and(anded); + ored = quad_or(ored); + xored = quad_xor(xored); + float4 swap_horiz = spvQuadSwap(float4(20.0), 0u); + bool4 swap_horiz_bool = spvQuadSwap(bool4(true), 0u); + float4 swap_vertical = spvQuadSwap(float4(20.0), 1u); + bool4 swap_vertical_bool = spvQuadSwap(bool4(true), 1u); + float4 swap_diagonal = spvQuadSwap(float4(20.0), 2u); + bool4 swap_diagonal_bool = spvQuadSwap(bool4(true), 2u); + float4 quad_broadcast0 = spvQuadBroadcast(float4(20.0), 3u); + bool4 quad_broadcast_bool = spvQuadBroadcast(bool4(true), 3u); +} + diff --git a/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl22.ios.comp b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl22.ios.comp new file mode 100644 index 00000000..bb319a3f --- /dev/null +++ b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl22.ios.comp @@ -0,0 +1,282 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct SSBO +{ + float FragColor; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +template +inline T spvSubgroupBroadcast(T value, ushort lane) +{ + return quad_broadcast(value, lane); +} + +template<> +inline bool spvSubgroupBroadcast(bool value, ushort lane) +{ + return !!quad_broadcast((ushort)value, lane); +} + +template +inline vec spvSubgroupBroadcast(vec value, ushort lane) +{ + return (vec)quad_broadcast((vec)value, lane); +} + +template +inline T spvSubgroupBroadcastFirst(T value) +{ + return quad_broadcast_first(value); +} + +template<> +inline bool spvSubgroupBroadcastFirst(bool value) +{ + return !!quad_broadcast_first((ushort)value); +} + +template +inline vec spvSubgroupBroadcastFirst(vec value) +{ + return (vec)quad_broadcast_first((vec)value); +} + +inline uint4 spvSubgroupBallot(bool value) +{ + return uint4((quad_vote::vote_t)quad_ballot(value), 0, 0, 0); +} + +inline bool spvSubgroupBallotBitExtract(uint4 ballot, uint bit) +{ + return !!extract_bits(ballot[bit / 32], bit % 32, 1); +} + +inline uint spvSubgroupBallotFindLSB(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0)); + ballot &= mask; + return select(ctz(ballot.x), select(32 + ctz(ballot.y), select(64 + ctz(ballot.z), select(96 + ctz(ballot.w), uint(-1), ballot.w == 0), ballot.z == 0), ballot.y == 0), ballot.x == 0); +} + +inline uint spvSubgroupBallotFindMSB(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0)); + ballot &= mask; + return select(128 - (clz(ballot.w) + 1), select(96 - (clz(ballot.z) + 1), select(64 - (clz(ballot.y) + 1), select(32 - (clz(ballot.x) + 1), uint(-1), ballot.x == 0), ballot.y == 0), ballot.z == 0), ballot.w == 0); +} + +inline uint spvPopCount4(uint4 ballot) +{ + return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w); +} + +inline uint spvSubgroupBallotBitCount(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0)); + return spvPopCount4(ballot & mask); +} + +inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID + 1), uint3(0)); + return spvPopCount4(ballot & mask); +} + +inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID), uint2(0)); + return spvPopCount4(ballot & mask); +} + +template +inline bool spvSubgroupAllEqual(T value) +{ + return quad_all(all(value == quad_broadcast_first(value))); +} + +template<> +inline bool spvSubgroupAllEqual(bool value) +{ + return quad_all(value) || !quad_any(value); +} + +template +inline bool spvSubgroupAllEqual(vec value) +{ + return quad_all(all(value == (vec)quad_broadcast_first((vec)value))); +} + +template +inline T spvSubgroupShuffle(T value, ushort lane) +{ + return quad_shuffle(value, lane); +} + +template<> +inline bool spvSubgroupShuffle(bool value, ushort lane) +{ + return !!quad_shuffle((ushort)value, lane); +} + +template +inline vec spvSubgroupShuffle(vec value, ushort lane) +{ + return (vec)quad_shuffle((vec)value, lane); +} + +template +inline T spvSubgroupShuffleXor(T value, ushort mask) +{ + return quad_shuffle_xor(value, mask); +} + +template<> +inline bool spvSubgroupShuffleXor(bool value, ushort mask) +{ + return !!quad_shuffle_xor((ushort)value, mask); +} + +template +inline vec spvSubgroupShuffleXor(vec value, ushort mask) +{ + return (vec)quad_shuffle_xor((vec)value, mask); +} + +template +inline T spvSubgroupShuffleUp(T value, ushort delta) +{ + return quad_shuffle_up(value, delta); +} + +template<> +inline bool spvSubgroupShuffleUp(bool value, ushort delta) +{ + return !!quad_shuffle_up((ushort)value, delta); +} + +template +inline vec spvSubgroupShuffleUp(vec value, ushort delta) +{ + return (vec)quad_shuffle_up((vec)value, delta); +} + +template +inline T spvSubgroupShuffleDown(T value, ushort delta) +{ + return quad_shuffle_down(value, delta); +} + +template<> +inline bool spvSubgroupShuffleDown(bool value, ushort delta) +{ + return !!quad_shuffle_down((ushort)value, delta); +} + +template +inline vec spvSubgroupShuffleDown(vec value, ushort delta) +{ + return (vec)quad_shuffle_down((vec)value, delta); +} + +template +inline T spvQuadBroadcast(T value, uint lane) +{ + return quad_broadcast(value, lane); +} + +template<> +inline bool spvQuadBroadcast(bool value, uint lane) +{ + return !!quad_broadcast((ushort)value, lane); +} + +template +inline vec spvQuadBroadcast(vec value, uint lane) +{ + return (vec)quad_broadcast((vec)value, lane); +} + +template +inline T spvQuadSwap(T value, uint dir) +{ + return quad_shuffle_xor(value, dir + 1); +} + +template<> +inline bool spvQuadSwap(bool value, uint dir) +{ + return !!quad_shuffle_xor((ushort)value, dir + 1); +} + +template +inline vec spvQuadSwap(vec value, uint dir) +{ + return (vec)quad_shuffle_xor((vec)value, dir + 1); +} + +kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[quadgroups_per_threadgroup]], uint gl_SubgroupID [[quadgroup_index_in_threadgroup]], uint gl_SubgroupSize [[thread_execution_width]], uint gl_SubgroupInvocationID [[thread_index_in_quadgroup]]) +{ + uint4 gl_SubgroupEqMask = uint4(1 << gl_SubgroupInvocationID, uint3(0)); + uint4 gl_SubgroupGeMask = uint4(insert_bits(0u, 0xFFFFFFFF, gl_SubgroupInvocationID, gl_SubgroupSize - gl_SubgroupInvocationID), uint3(0)); + uint4 gl_SubgroupGtMask = uint4(insert_bits(0u, 0xFFFFFFFF, gl_SubgroupInvocationID + 1, gl_SubgroupSize - gl_SubgroupInvocationID - 1), uint3(0)); + uint4 gl_SubgroupLeMask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID + 1), uint3(0)); + uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID), uint3(0)); + _9.FragColor = float(gl_NumSubgroups); + _9.FragColor = float(gl_SubgroupID); + _9.FragColor = float(gl_SubgroupSize); + _9.FragColor = float(gl_SubgroupInvocationID); + simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + simdgroup_barrier(mem_flags::mem_device); + simdgroup_barrier(mem_flags::mem_threadgroup); + simdgroup_barrier(mem_flags::mem_texture); + bool _39 = quad_is_first(); + bool elected = _39; + _9.FragColor = float4(gl_SubgroupEqMask).x; + _9.FragColor = float4(gl_SubgroupGeMask).x; + _9.FragColor = float4(gl_SubgroupGtMask).x; + _9.FragColor = float4(gl_SubgroupLeMask).x; + _9.FragColor = float4(gl_SubgroupLtMask).x; + float4 broadcasted = spvSubgroupBroadcast(float4(10.0), 8u); + bool2 broadcasted_bool = spvSubgroupBroadcast(bool2(true), 8u); + float3 first = spvSubgroupBroadcastFirst(float3(20.0)); + bool4 first_bool = spvSubgroupBroadcastFirst(bool4(false)); + uint4 ballot_value = spvSubgroupBallot(true); + bool inverse_ballot_value = spvSubgroupBallotBitExtract(ballot_value, gl_SubgroupInvocationID); + bool bit_extracted = spvSubgroupBallotBitExtract(uint4(10u), 8u); + uint bit_count = spvSubgroupBallotBitCount(ballot_value, gl_SubgroupSize); + uint inclusive_bit_count = spvSubgroupBallotInclusiveBitCount(ballot_value, gl_SubgroupInvocationID); + uint exclusive_bit_count = spvSubgroupBallotExclusiveBitCount(ballot_value, gl_SubgroupInvocationID); + uint lsb = spvSubgroupBallotFindLSB(ballot_value, gl_SubgroupSize); + uint msb = spvSubgroupBallotFindMSB(ballot_value, gl_SubgroupSize); + uint shuffled = spvSubgroupShuffle(10u, 8u); + bool shuffled_bool = spvSubgroupShuffle(true, 9u); + uint shuffled_xor = spvSubgroupShuffleXor(30u, 8u); + bool shuffled_xor_bool = spvSubgroupShuffleXor(false, 9u); + uint shuffled_up = spvSubgroupShuffleUp(20u, 4u); + bool shuffled_up_bool = spvSubgroupShuffleUp(true, 4u); + uint shuffled_down = spvSubgroupShuffleDown(20u, 4u); + bool shuffled_down_bool = spvSubgroupShuffleDown(false, 4u); + bool has_all = quad_all(true); + bool has_any = quad_any(true); + bool has_equal = spvSubgroupAllEqual(0); + has_equal = spvSubgroupAllEqual(true); + has_equal = spvSubgroupAllEqual(float3(0.0, 1.0, 2.0)); + has_equal = spvSubgroupAllEqual(bool4(true, true, false, true)); + float4 swap_horiz = spvQuadSwap(float4(20.0), 0u); + bool4 swap_horiz_bool = spvQuadSwap(bool4(true), 0u); + float4 swap_vertical = spvQuadSwap(float4(20.0), 1u); + bool4 swap_vertical_bool = spvQuadSwap(bool4(true), 1u); + float4 swap_diagonal = spvQuadSwap(float4(20.0), 2u); + bool4 swap_diagonal_bool = spvQuadSwap(bool4(true), 2u); + float4 quad_broadcast0 = spvQuadBroadcast(float4(20.0), 3u); + bool4 quad_broadcast_bool = spvQuadBroadcast(bool4(true), 3u); +} + diff --git a/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl23.ios.simd.comp b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl23.ios.simd.comp new file mode 100644 index 00000000..9c1d9ae2 --- /dev/null +++ b/reference/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl23.ios.simd.comp @@ -0,0 +1,316 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct SSBO +{ + float FragColor; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +template +inline T spvSubgroupBroadcast(T value, ushort lane) +{ + return simd_broadcast(value, lane); +} + +template<> +inline bool spvSubgroupBroadcast(bool value, ushort lane) +{ + return !!simd_broadcast((ushort)value, lane); +} + +template +inline vec spvSubgroupBroadcast(vec value, ushort lane) +{ + return (vec)simd_broadcast((vec)value, lane); +} + +template +inline T spvSubgroupBroadcastFirst(T value) +{ + return simd_broadcast_first(value); +} + +template<> +inline bool spvSubgroupBroadcastFirst(bool value) +{ + return !!simd_broadcast_first((ushort)value); +} + +template +inline vec spvSubgroupBroadcastFirst(vec value) +{ + return (vec)simd_broadcast_first((vec)value); +} + +inline uint4 spvSubgroupBallot(bool value) +{ + return uint4((simd_vote::vote_t)simd_ballot(value), 0, 0, 0); +} + +inline bool spvSubgroupBallotBitExtract(uint4 ballot, uint bit) +{ + return !!extract_bits(ballot[bit / 32], bit % 32, 1); +} + +inline uint spvSubgroupBallotFindLSB(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0)); + ballot &= mask; + return select(ctz(ballot.x), select(32 + ctz(ballot.y), select(64 + ctz(ballot.z), select(96 + ctz(ballot.w), uint(-1), ballot.w == 0), ballot.z == 0), ballot.y == 0), ballot.x == 0); +} + +inline uint spvSubgroupBallotFindMSB(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0)); + ballot &= mask; + return select(128 - (clz(ballot.w) + 1), select(96 - (clz(ballot.z) + 1), select(64 - (clz(ballot.y) + 1), select(32 - (clz(ballot.x) + 1), uint(-1), ballot.x == 0), ballot.y == 0), ballot.z == 0), ballot.w == 0); +} + +inline uint spvPopCount4(uint4 ballot) +{ + return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w); +} + +inline uint spvSubgroupBallotBitCount(uint4 ballot, uint gl_SubgroupSize) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0)); + return spvPopCount4(ballot & mask); +} + +inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID + 1), uint3(0)); + return spvPopCount4(ballot & mask); +} + +inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID) +{ + uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID), uint2(0)); + return spvPopCount4(ballot & mask); +} + +template +inline bool spvSubgroupAllEqual(T value) +{ + return simd_all(all(value == simd_broadcast_first(value))); +} + +template<> +inline bool spvSubgroupAllEqual(bool value) +{ + return simd_all(value) || !simd_any(value); +} + +template +inline bool spvSubgroupAllEqual(vec value) +{ + return simd_all(all(value == (vec)simd_broadcast_first((vec)value))); +} + +template +inline T spvSubgroupShuffle(T value, ushort lane) +{ + return simd_shuffle(value, lane); +} + +template<> +inline bool spvSubgroupShuffle(bool value, ushort lane) +{ + return !!simd_shuffle((ushort)value, lane); +} + +template +inline vec spvSubgroupShuffle(vec value, ushort lane) +{ + return (vec)simd_shuffle((vec)value, lane); +} + +template +inline T spvSubgroupShuffleXor(T value, ushort mask) +{ + return simd_shuffle_xor(value, mask); +} + +template<> +inline bool spvSubgroupShuffleXor(bool value, ushort mask) +{ + return !!simd_shuffle_xor((ushort)value, mask); +} + +template +inline vec spvSubgroupShuffleXor(vec value, ushort mask) +{ + return (vec)simd_shuffle_xor((vec)value, mask); +} + +template +inline T spvSubgroupShuffleUp(T value, ushort delta) +{ + return simd_shuffle_up(value, delta); +} + +template<> +inline bool spvSubgroupShuffleUp(bool value, ushort delta) +{ + return !!simd_shuffle_up((ushort)value, delta); +} + +template +inline vec spvSubgroupShuffleUp(vec value, ushort delta) +{ + return (vec)simd_shuffle_up((vec)value, delta); +} + +template +inline T spvSubgroupShuffleDown(T value, ushort delta) +{ + return simd_shuffle_down(value, delta); +} + +template<> +inline bool spvSubgroupShuffleDown(bool value, ushort delta) +{ + return !!simd_shuffle_down((ushort)value, delta); +} + +template +inline vec spvSubgroupShuffleDown(vec value, ushort delta) +{ + return (vec)simd_shuffle_down((vec)value, delta); +} + +template +inline T spvQuadBroadcast(T value, uint lane) +{ + return quad_broadcast(value, lane); +} + +template<> +inline bool spvQuadBroadcast(bool value, uint lane) +{ + return !!quad_broadcast((ushort)value, lane); +} + +template +inline vec spvQuadBroadcast(vec value, uint lane) +{ + return (vec)quad_broadcast((vec)value, lane); +} + +template +inline T spvQuadSwap(T value, uint dir) +{ + return quad_shuffle_xor(value, dir + 1); +} + +template<> +inline bool spvQuadSwap(bool value, uint dir) +{ + return !!quad_shuffle_xor((ushort)value, dir + 1); +} + +template +inline vec spvQuadSwap(vec value, uint dir) +{ + return (vec)quad_shuffle_xor((vec)value, dir + 1); +} + +kernel void main0(device SSBO& _9 [[buffer(0)]], uint gl_NumSubgroups [[quadgroups_per_threadgroup]], uint gl_SubgroupID [[quadgroup_index_in_threadgroup]], uint gl_SubgroupSize [[thread_execution_width]], uint gl_SubgroupInvocationID [[thread_index_in_quadgroup]]) +{ + uint4 gl_SubgroupEqMask = uint4(1 << gl_SubgroupInvocationID, uint3(0)); + uint4 gl_SubgroupGeMask = uint4(insert_bits(0u, 0xFFFFFFFF, gl_SubgroupInvocationID, gl_SubgroupSize - gl_SubgroupInvocationID), uint3(0)); + uint4 gl_SubgroupGtMask = uint4(insert_bits(0u, 0xFFFFFFFF, gl_SubgroupInvocationID + 1, gl_SubgroupSize - gl_SubgroupInvocationID - 1), uint3(0)); + uint4 gl_SubgroupLeMask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID + 1), uint3(0)); + uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID), uint3(0)); + _9.FragColor = float(gl_NumSubgroups); + _9.FragColor = float(gl_SubgroupID); + _9.FragColor = float(gl_SubgroupSize); + _9.FragColor = float(gl_SubgroupInvocationID); + simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + simdgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture); + simdgroup_barrier(mem_flags::mem_device); + simdgroup_barrier(mem_flags::mem_threadgroup); + simdgroup_barrier(mem_flags::mem_texture); + bool _39 = simd_is_first(); + bool elected = _39; + _9.FragColor = float4(gl_SubgroupEqMask).x; + _9.FragColor = float4(gl_SubgroupGeMask).x; + _9.FragColor = float4(gl_SubgroupGtMask).x; + _9.FragColor = float4(gl_SubgroupLeMask).x; + _9.FragColor = float4(gl_SubgroupLtMask).x; + float4 broadcasted = spvSubgroupBroadcast(float4(10.0), 8u); + bool2 broadcasted_bool = spvSubgroupBroadcast(bool2(true), 8u); + float3 first = spvSubgroupBroadcastFirst(float3(20.0)); + bool4 first_bool = spvSubgroupBroadcastFirst(bool4(false)); + uint4 ballot_value = spvSubgroupBallot(true); + bool inverse_ballot_value = spvSubgroupBallotBitExtract(ballot_value, gl_SubgroupInvocationID); + bool bit_extracted = spvSubgroupBallotBitExtract(uint4(10u), 8u); + uint bit_count = spvSubgroupBallotBitCount(ballot_value, gl_SubgroupSize); + uint inclusive_bit_count = spvSubgroupBallotInclusiveBitCount(ballot_value, gl_SubgroupInvocationID); + uint exclusive_bit_count = spvSubgroupBallotExclusiveBitCount(ballot_value, gl_SubgroupInvocationID); + uint lsb = spvSubgroupBallotFindLSB(ballot_value, gl_SubgroupSize); + uint msb = spvSubgroupBallotFindMSB(ballot_value, gl_SubgroupSize); + uint shuffled = spvSubgroupShuffle(10u, 8u); + bool shuffled_bool = spvSubgroupShuffle(true, 9u); + uint shuffled_xor = spvSubgroupShuffleXor(30u, 8u); + bool shuffled_xor_bool = spvSubgroupShuffleXor(false, 9u); + uint shuffled_up = spvSubgroupShuffleUp(20u, 4u); + bool shuffled_up_bool = spvSubgroupShuffleUp(true, 4u); + uint shuffled_down = spvSubgroupShuffleDown(20u, 4u); + bool shuffled_down_bool = spvSubgroupShuffleDown(false, 4u); + bool has_all = simd_all(true); + bool has_any = simd_any(true); + bool has_equal = spvSubgroupAllEqual(0); + has_equal = spvSubgroupAllEqual(true); + has_equal = spvSubgroupAllEqual(float3(0.0, 1.0, 2.0)); + has_equal = spvSubgroupAllEqual(bool4(true, true, false, true)); + float4 added = simd_sum(float4(20.0)); + int4 iadded = simd_sum(int4(20)); + float4 multiplied = simd_product(float4(20.0)); + int4 imultiplied = simd_product(int4(20)); + float4 lo = simd_min(float4(20.0)); + float4 hi = simd_max(float4(20.0)); + int4 slo = simd_min(int4(20)); + int4 shi = simd_max(int4(20)); + uint4 ulo = simd_min(uint4(20u)); + uint4 uhi = simd_max(uint4(20u)); + uint4 anded = simd_and(ballot_value); + uint4 ored = simd_or(ballot_value); + uint4 xored = simd_xor(ballot_value); + added = simd_prefix_inclusive_sum(added); + iadded = simd_prefix_inclusive_sum(iadded); + multiplied = simd_prefix_inclusive_product(multiplied); + imultiplied = simd_prefix_inclusive_product(imultiplied); + added = simd_prefix_exclusive_sum(multiplied); + multiplied = simd_prefix_exclusive_product(multiplied); + iadded = simd_prefix_exclusive_sum(imultiplied); + imultiplied = simd_prefix_exclusive_product(imultiplied); + added = quad_sum(added); + multiplied = quad_product(multiplied); + iadded = quad_sum(iadded); + imultiplied = quad_product(imultiplied); + lo = quad_min(lo); + hi = quad_max(hi); + ulo = quad_min(ulo); + uhi = quad_max(uhi); + slo = quad_min(slo); + shi = quad_max(shi); + anded = quad_and(anded); + ored = quad_or(ored); + xored = quad_xor(xored); + float4 swap_horiz = spvQuadSwap(float4(20.0), 0u); + bool4 swap_horiz_bool = spvQuadSwap(bool4(true), 0u); + float4 swap_vertical = spvQuadSwap(float4(20.0), 1u); + bool4 swap_vertical_bool = spvQuadSwap(bool4(true), 1u); + float4 swap_diagonal = spvQuadSwap(float4(20.0), 2u); + bool4 swap_diagonal_bool = spvQuadSwap(bool4(true), 2u); + float4 quad_broadcast0 = spvQuadBroadcast(float4(20.0), 3u); + bool4 quad_broadcast_bool = spvQuadBroadcast(bool4(true), 3u); +} + diff --git a/reference/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl21.frag b/reference/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl22.frag similarity index 98% rename from reference/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl21.frag rename to reference/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl22.frag index 021ce825..2243190d 100644 --- a/reference/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl21.frag +++ b/reference/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl22.frag @@ -224,11 +224,9 @@ inline vec spvQuadSwap(vec value, uint dir) return (vec)quad_shuffle_xor((vec)value, dir + 1); } -fragment main0_out main0() +fragment main0_out main0(uint gl_SubgroupSize [[threads_per_simdgroup]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]]) { main0_out out = {}; - uint gl_SubgroupSize = simd_sum(1); - uint gl_SubgroupInvocationID = simd_prefix_exclusive_sum(1); uint4 gl_SubgroupEqMask = gl_SubgroupInvocationID >= 32 ? uint4(0, (1 << (gl_SubgroupInvocationID - 32)), uint2(0)) : uint4(1 << gl_SubgroupInvocationID, uint3(0)); uint4 gl_SubgroupGeMask = uint4(insert_bits(0u, 0xFFFFFFFF, min(gl_SubgroupInvocationID, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID, 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID, 32u), 0)), uint2(0)); uint4 gl_SubgroupGtMask = uint4(insert_bits(0u, 0xFFFFFFFF, min(gl_SubgroupInvocationID + 1, 32u), (uint)max(min((int)gl_SubgroupSize, 32) - (int)gl_SubgroupInvocationID - 1, 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0), (uint)max((int)gl_SubgroupSize - (int)max(gl_SubgroupInvocationID + 1, 32u), 0)), uint2(0)); @@ -236,7 +234,8 @@ fragment main0_out main0() uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0)); out.FragColor = float(gl_SubgroupSize); out.FragColor = float(gl_SubgroupInvocationID); - bool elected = simd_is_first(); + bool _24 = simd_is_first(); + bool elected = _24; out.FragColor = float4(gl_SubgroupEqMask).x; out.FragColor = float4(gl_SubgroupGeMask).x; out.FragColor = float4(gl_SubgroupGtMask).x; diff --git a/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl12.emulate-subgroup.comp b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl12.emulate-subgroup.comp new file mode 100644 index 00000000..8a0be226 --- /dev/null +++ b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl12.emulate-subgroup.comp @@ -0,0 +1,25 @@ +#version 450 +#extension GL_KHR_shader_subgroup_basic : require +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer SSBO +{ + float FragColor; +}; + +// Reduced test for emulated functionality. + +void main() +{ + // basic + FragColor = float(gl_NumSubgroups); + FragColor = float(gl_SubgroupID); + FragColor = float(gl_SubgroupSize); + FragColor = float(gl_SubgroupInvocationID); + subgroupBarrier(); + subgroupMemoryBarrier(); + subgroupMemoryBarrierBuffer(); + subgroupMemoryBarrierShared(); + subgroupMemoryBarrierImage(); + bool elected = subgroupElect(); +} diff --git a/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.fixed-subgroup.comp b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.fixed-subgroup.comp new file mode 100644 index 00000000..28c5d6b3 --- /dev/null +++ b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl21.fixed-subgroup.comp @@ -0,0 +1,138 @@ +#version 450 +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_clustered : require +#extension GL_KHR_shader_subgroup_quad : require +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer SSBO +{ + float FragColor; +}; + +void main() +{ + // basic + FragColor = float(gl_NumSubgroups); + FragColor = float(gl_SubgroupID); + FragColor = float(gl_SubgroupSize); + FragColor = float(gl_SubgroupInvocationID); + subgroupBarrier(); + subgroupMemoryBarrier(); + subgroupMemoryBarrierBuffer(); + subgroupMemoryBarrierShared(); + subgroupMemoryBarrierImage(); + bool elected = subgroupElect(); + + // ballot + FragColor = float(gl_SubgroupEqMask); + FragColor = float(gl_SubgroupGeMask); + FragColor = float(gl_SubgroupGtMask); + FragColor = float(gl_SubgroupLeMask); + FragColor = float(gl_SubgroupLtMask); + vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u); + bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u); + vec3 first = subgroupBroadcastFirst(vec3(20.0)); + bvec4 first_bool = subgroupBroadcastFirst(bvec4(false)); + uvec4 ballot_value = subgroupBallot(true); + bool inverse_ballot_value = subgroupInverseBallot(ballot_value); + bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u); + uint bit_count = subgroupBallotBitCount(ballot_value); + uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value); + uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value); + uint lsb = subgroupBallotFindLSB(ballot_value); + uint msb = subgroupBallotFindMSB(ballot_value); + + // shuffle + uint shuffled = subgroupShuffle(10u, 8u); + bool shuffled_bool = subgroupShuffle(true, 9u); + uint shuffled_xor = subgroupShuffleXor(30u, 8u); + bool shuffled_xor_bool = subgroupShuffleXor(false, 9u); + + // shuffle relative + uint shuffled_up = subgroupShuffleUp(20u, 4u); + bool shuffled_up_bool = subgroupShuffleUp(true, 4u); + uint shuffled_down = subgroupShuffleDown(20u, 4u); + bool shuffled_down_bool = subgroupShuffleDown(false, 4u); + + // vote + bool has_all = subgroupAll(true); + bool has_any = subgroupAny(true); + bool has_equal = subgroupAllEqual(0); + has_equal = subgroupAllEqual(true); + has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0)); + has_equal = subgroupAllEqual(bvec4(true, true, false, true)); + + // arithmetic + vec4 added = subgroupAdd(vec4(20.0)); + ivec4 iadded = subgroupAdd(ivec4(20)); + vec4 multiplied = subgroupMul(vec4(20.0)); + ivec4 imultiplied = subgroupMul(ivec4(20)); + vec4 lo = subgroupMin(vec4(20.0)); + vec4 hi = subgroupMax(vec4(20.0)); + ivec4 slo = subgroupMin(ivec4(20)); + ivec4 shi = subgroupMax(ivec4(20)); + uvec4 ulo = subgroupMin(uvec4(20)); + uvec4 uhi = subgroupMax(uvec4(20)); + uvec4 anded = subgroupAnd(ballot_value); + uvec4 ored = subgroupOr(ballot_value); + uvec4 xored = subgroupXor(ballot_value); + + added = subgroupInclusiveAdd(added); + iadded = subgroupInclusiveAdd(iadded); + multiplied = subgroupInclusiveMul(multiplied); + imultiplied = subgroupInclusiveMul(imultiplied); + //lo = subgroupInclusiveMin(lo); // FIXME: Unsupported by Metal + //hi = subgroupInclusiveMax(hi); + //slo = subgroupInclusiveMin(slo); + //shi = subgroupInclusiveMax(shi); + //ulo = subgroupInclusiveMin(ulo); + //uhi = subgroupInclusiveMax(uhi); + //anded = subgroupInclusiveAnd(anded); + //ored = subgroupInclusiveOr(ored); + //xored = subgroupInclusiveXor(ored); + //added = subgroupExclusiveAdd(lo); + + added = subgroupExclusiveAdd(multiplied); + multiplied = subgroupExclusiveMul(multiplied); + iadded = subgroupExclusiveAdd(imultiplied); + imultiplied = subgroupExclusiveMul(imultiplied); + //lo = subgroupExclusiveMin(lo); // FIXME: Unsupported by Metal + //hi = subgroupExclusiveMax(hi); + //ulo = subgroupExclusiveMin(ulo); + //uhi = subgroupExclusiveMax(uhi); + //slo = subgroupExclusiveMin(slo); + //shi = subgroupExclusiveMax(shi); + //anded = subgroupExclusiveAnd(anded); + //ored = subgroupExclusiveOr(ored); + //xored = subgroupExclusiveXor(ored); + + // clustered + added = subgroupClusteredAdd(added, 4u); + multiplied = subgroupClusteredMul(multiplied, 4u); + iadded = subgroupClusteredAdd(iadded, 4u); + imultiplied = subgroupClusteredMul(imultiplied, 4u); + lo = subgroupClusteredMin(lo, 4u); + hi = subgroupClusteredMax(hi, 4u); + ulo = subgroupClusteredMin(ulo, 4u); + uhi = subgroupClusteredMax(uhi, 4u); + slo = subgroupClusteredMin(slo, 4u); + shi = subgroupClusteredMax(shi, 4u); + anded = subgroupClusteredAnd(anded, 4u); + ored = subgroupClusteredOr(ored, 4u); + xored = subgroupClusteredXor(xored, 4u); + + // quad + vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0)); + bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true)); + vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0)); + bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true)); + vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0)); + bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true)); + vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u); + bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u); +} diff --git a/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl22.ios.comp b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl22.ios.comp new file mode 100644 index 00000000..bc904a4f --- /dev/null +++ b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl22.ios.comp @@ -0,0 +1,79 @@ +#version 450 +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +#extension GL_KHR_shader_subgroup_quad : require +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer SSBO +{ + float FragColor; +}; + +// Reduced test for functionality exposed on iOS. + +void main() +{ + // basic + FragColor = float(gl_NumSubgroups); + FragColor = float(gl_SubgroupID); + FragColor = float(gl_SubgroupSize); + FragColor = float(gl_SubgroupInvocationID); + subgroupBarrier(); + subgroupMemoryBarrier(); + subgroupMemoryBarrierBuffer(); + subgroupMemoryBarrierShared(); + subgroupMemoryBarrierImage(); + bool elected = subgroupElect(); + + // ballot + FragColor = float(gl_SubgroupEqMask); + FragColor = float(gl_SubgroupGeMask); + FragColor = float(gl_SubgroupGtMask); + FragColor = float(gl_SubgroupLeMask); + FragColor = float(gl_SubgroupLtMask); + vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u); + bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u); + vec3 first = subgroupBroadcastFirst(vec3(20.0)); + bvec4 first_bool = subgroupBroadcastFirst(bvec4(false)); + uvec4 ballot_value = subgroupBallot(true); + bool inverse_ballot_value = subgroupInverseBallot(ballot_value); + bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u); + uint bit_count = subgroupBallotBitCount(ballot_value); + uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value); + uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value); + uint lsb = subgroupBallotFindLSB(ballot_value); + uint msb = subgroupBallotFindMSB(ballot_value); + + // shuffle + uint shuffled = subgroupShuffle(10u, 8u); + bool shuffled_bool = subgroupShuffle(true, 9u); + uint shuffled_xor = subgroupShuffleXor(30u, 8u); + bool shuffled_xor_bool = subgroupShuffleXor(false, 9u); + + // shuffle relative + uint shuffled_up = subgroupShuffleUp(20u, 4u); + bool shuffled_up_bool = subgroupShuffleUp(true, 4u); + uint shuffled_down = subgroupShuffleDown(20u, 4u); + bool shuffled_down_bool = subgroupShuffleDown(false, 4u); + + // vote + bool has_all = subgroupAll(true); + bool has_any = subgroupAny(true); + bool has_equal = subgroupAllEqual(0); + has_equal = subgroupAllEqual(true); + has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0)); + has_equal = subgroupAllEqual(bvec4(true, true, false, true)); + + // quad + vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0)); + bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true)); + vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0)); + bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true)); + vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0)); + bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true)); + vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u); + bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u); +} diff --git a/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl23.ios.simd.comp b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl23.ios.simd.comp new file mode 100644 index 00000000..28c5d6b3 --- /dev/null +++ b/shaders-msl-no-opt/comp/subgroups.nocompat.invalid.vk.msl23.ios.simd.comp @@ -0,0 +1,138 @@ +#version 450 +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_clustered : require +#extension GL_KHR_shader_subgroup_quad : require +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer SSBO +{ + float FragColor; +}; + +void main() +{ + // basic + FragColor = float(gl_NumSubgroups); + FragColor = float(gl_SubgroupID); + FragColor = float(gl_SubgroupSize); + FragColor = float(gl_SubgroupInvocationID); + subgroupBarrier(); + subgroupMemoryBarrier(); + subgroupMemoryBarrierBuffer(); + subgroupMemoryBarrierShared(); + subgroupMemoryBarrierImage(); + bool elected = subgroupElect(); + + // ballot + FragColor = float(gl_SubgroupEqMask); + FragColor = float(gl_SubgroupGeMask); + FragColor = float(gl_SubgroupGtMask); + FragColor = float(gl_SubgroupLeMask); + FragColor = float(gl_SubgroupLtMask); + vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u); + bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u); + vec3 first = subgroupBroadcastFirst(vec3(20.0)); + bvec4 first_bool = subgroupBroadcastFirst(bvec4(false)); + uvec4 ballot_value = subgroupBallot(true); + bool inverse_ballot_value = subgroupInverseBallot(ballot_value); + bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u); + uint bit_count = subgroupBallotBitCount(ballot_value); + uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value); + uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value); + uint lsb = subgroupBallotFindLSB(ballot_value); + uint msb = subgroupBallotFindMSB(ballot_value); + + // shuffle + uint shuffled = subgroupShuffle(10u, 8u); + bool shuffled_bool = subgroupShuffle(true, 9u); + uint shuffled_xor = subgroupShuffleXor(30u, 8u); + bool shuffled_xor_bool = subgroupShuffleXor(false, 9u); + + // shuffle relative + uint shuffled_up = subgroupShuffleUp(20u, 4u); + bool shuffled_up_bool = subgroupShuffleUp(true, 4u); + uint shuffled_down = subgroupShuffleDown(20u, 4u); + bool shuffled_down_bool = subgroupShuffleDown(false, 4u); + + // vote + bool has_all = subgroupAll(true); + bool has_any = subgroupAny(true); + bool has_equal = subgroupAllEqual(0); + has_equal = subgroupAllEqual(true); + has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0)); + has_equal = subgroupAllEqual(bvec4(true, true, false, true)); + + // arithmetic + vec4 added = subgroupAdd(vec4(20.0)); + ivec4 iadded = subgroupAdd(ivec4(20)); + vec4 multiplied = subgroupMul(vec4(20.0)); + ivec4 imultiplied = subgroupMul(ivec4(20)); + vec4 lo = subgroupMin(vec4(20.0)); + vec4 hi = subgroupMax(vec4(20.0)); + ivec4 slo = subgroupMin(ivec4(20)); + ivec4 shi = subgroupMax(ivec4(20)); + uvec4 ulo = subgroupMin(uvec4(20)); + uvec4 uhi = subgroupMax(uvec4(20)); + uvec4 anded = subgroupAnd(ballot_value); + uvec4 ored = subgroupOr(ballot_value); + uvec4 xored = subgroupXor(ballot_value); + + added = subgroupInclusiveAdd(added); + iadded = subgroupInclusiveAdd(iadded); + multiplied = subgroupInclusiveMul(multiplied); + imultiplied = subgroupInclusiveMul(imultiplied); + //lo = subgroupInclusiveMin(lo); // FIXME: Unsupported by Metal + //hi = subgroupInclusiveMax(hi); + //slo = subgroupInclusiveMin(slo); + //shi = subgroupInclusiveMax(shi); + //ulo = subgroupInclusiveMin(ulo); + //uhi = subgroupInclusiveMax(uhi); + //anded = subgroupInclusiveAnd(anded); + //ored = subgroupInclusiveOr(ored); + //xored = subgroupInclusiveXor(ored); + //added = subgroupExclusiveAdd(lo); + + added = subgroupExclusiveAdd(multiplied); + multiplied = subgroupExclusiveMul(multiplied); + iadded = subgroupExclusiveAdd(imultiplied); + imultiplied = subgroupExclusiveMul(imultiplied); + //lo = subgroupExclusiveMin(lo); // FIXME: Unsupported by Metal + //hi = subgroupExclusiveMax(hi); + //ulo = subgroupExclusiveMin(ulo); + //uhi = subgroupExclusiveMax(uhi); + //slo = subgroupExclusiveMin(slo); + //shi = subgroupExclusiveMax(shi); + //anded = subgroupExclusiveAnd(anded); + //ored = subgroupExclusiveOr(ored); + //xored = subgroupExclusiveXor(ored); + + // clustered + added = subgroupClusteredAdd(added, 4u); + multiplied = subgroupClusteredMul(multiplied, 4u); + iadded = subgroupClusteredAdd(iadded, 4u); + imultiplied = subgroupClusteredMul(imultiplied, 4u); + lo = subgroupClusteredMin(lo, 4u); + hi = subgroupClusteredMax(hi, 4u); + ulo = subgroupClusteredMin(ulo, 4u); + uhi = subgroupClusteredMax(uhi, 4u); + slo = subgroupClusteredMin(slo, 4u); + shi = subgroupClusteredMax(shi, 4u); + anded = subgroupClusteredAnd(anded, 4u); + ored = subgroupClusteredOr(ored, 4u); + xored = subgroupClusteredXor(xored, 4u); + + // quad + vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0)); + bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true)); + vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0)); + bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true)); + vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0)); + bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true)); + vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u); + bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u); +} diff --git a/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl21.frag b/shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl22.frag similarity index 100% rename from shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl21.frag rename to shaders-msl-no-opt/frag/subgroups.nocompat.invalid.vk.msl22.frag diff --git a/spirv_cross_c.cpp b/spirv_cross_c.cpp index 5506d8d0..4b561378 100644 --- a/spirv_cross_c.cpp +++ b/spirv_cross_c.cpp @@ -678,6 +678,18 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c case SPVC_COMPILER_OPTION_MSL_R32UI_ALIGNMENT_CONSTANT_ID: options->msl.r32ui_alignment_constant_id = value; break; + + case SPVC_COMPILER_OPTION_MSL_IOS_USE_SIMDGROUP_FUNCTIONS: + options->msl.ios_use_simdgroup_functions = value != 0; + break; + + case SPVC_COMPILER_OPTION_MSL_EMULATE_SUBGROUPS: + options->msl.emulate_subgroups = value != 0; + break; + + case SPVC_COMPILER_OPTION_MSL_FIXED_SUBGROUP_SIZE: + options->msl.fixed_subgroup_size = value; + break; #endif default: diff --git a/spirv_cross_c.h b/spirv_cross_c.h index 7ccec0aa..e12c2eaf 100644 --- a/spirv_cross_c.h +++ b/spirv_cross_c.h @@ -33,7 +33,7 @@ extern "C" { /* Bumped if ABI or API breaks backwards compatibility. */ #define SPVC_C_API_VERSION_MAJOR 0 /* Bumped if APIs or enumerations are added in a backwards compatible way. */ -#define SPVC_C_API_VERSION_MINOR 42 +#define SPVC_C_API_VERSION_MINOR 43 /* Bumped if internal implementation details change. */ #define SPVC_C_API_VERSION_PATCH 0 @@ -647,6 +647,10 @@ typedef enum spvc_compiler_option SPVC_COMPILER_OPTION_HLSL_FLATTEN_MATRIX_VERTEX_INPUT_SEMANTICS = 71 | SPVC_COMPILER_OPTION_HLSL_BIT, + SPVC_COMPILER_OPTION_MSL_IOS_USE_SIMDGROUP_FUNCTIONS = 72 | SPVC_COMPILER_OPTION_MSL_BIT, + SPVC_COMPILER_OPTION_MSL_EMULATE_SUBGROUPS = 73 | SPVC_COMPILER_OPTION_MSL_BIT, + SPVC_COMPILER_OPTION_MSL_FIXED_SUBGROUP_SIZE = 74 | SPVC_COMPILER_OPTION_MSL_BIT, + SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff } spvc_compiler_option; diff --git a/spirv_msl.cpp b/spirv_msl.cpp index 1193af75..f8cd1496 100644 --- a/spirv_msl.cpp +++ b/spirv_msl.cpp @@ -168,9 +168,12 @@ void CompilerMSL::build_implicit_builtins() active_input_builtins.get(BuiltInBaseVertex) || active_input_builtins.get(BuiltInInstanceId) || active_input_builtins.get(BuiltInInstanceIndex) || active_input_builtins.get(BuiltInBaseInstance)); bool need_sample_mask = msl_options.additional_fixed_sample_mask != 0xffffffff; + bool need_local_invocation_index = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInSubgroupId); + bool need_workgroup_size = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInNumSubgroups); if (need_subpass_input || need_sample_pos || need_subgroup_mask || need_vertex_params || need_tesc_params || need_multiview || need_dispatch_base || need_vertex_base_params || need_grid_params || needs_sample_id || - needs_subgroup_invocation_id || needs_subgroup_size || need_sample_mask) + needs_subgroup_invocation_id || needs_subgroup_size || need_sample_mask || need_local_invocation_index || + need_workgroup_size) { bool has_frag_coord = false; bool has_sample_id = false; @@ -184,6 +187,8 @@ void CompilerMSL::build_implicit_builtins() bool has_subgroup_size = false; bool has_view_idx = false; bool has_layer = false; + bool has_local_invocation_index = false; + bool has_workgroup_size = false; uint32_t workgroup_id_type = 0; // FIXME: Investigate the fact that there are no checks for the entry point interface variables. @@ -191,7 +196,6 @@ void CompilerMSL::build_implicit_builtins() if (!ir.meta[var.self].decoration.builtin) return; - // Use Metal's native frame-buffer fetch API for subpass inputs. BuiltIn builtin = ir.meta[var.self].decoration.builtin_type; if (var.storage == StorageClassOutput) @@ -207,6 +211,7 @@ void CompilerMSL::build_implicit_builtins() if (var.storage != StorageClassInput) return; + // Use Metal's native frame-buffer fetch API for subpass inputs. if (need_subpass_input && (!msl_options.use_framebuffer_fetch_subpasses)) { switch (builtin) @@ -330,6 +335,20 @@ void CompilerMSL::build_implicit_builtins() } } + if (need_local_invocation_index && builtin == BuiltInLocalInvocationIndex) + { + builtin_local_invocation_index_id = var.self; + mark_implicit_builtin(StorageClassInput, BuiltInLocalInvocationIndex, var.self); + has_local_invocation_index = true; + } + + if (need_workgroup_size && builtin == BuiltInLocalInvocationId) + { + builtin_workgroup_size_id = var.self; + mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var.self); + has_workgroup_size = true; + } + // The base workgroup needs to have the same type and vector size // as the workgroup or invocation ID, so keep track of the type that // was used. @@ -681,6 +700,48 @@ void CompilerMSL::build_implicit_builtins() builtin_sample_mask_id = var_id; mark_implicit_builtin(StorageClassOutput, BuiltInSampleMask, var_id); } + + if (need_local_invocation_index && !has_local_invocation_index) + { + uint32_t offset = ir.increase_bound_by(2); + uint32_t type_ptr_id = offset; + uint32_t var_id = offset + 1; + + // Create gl_LocalInvocationIndex. + SPIRType uint_type_ptr; + uint_type_ptr = get_uint_type(); + uint_type_ptr.pointer = true; + uint_type_ptr.parent_type = get_uint_type_id(); + uint_type_ptr.storage = StorageClassInput; + + auto &ptr_type = set(type_ptr_id, uint_type_ptr); + ptr_type.self = get_uint_type_id(); + set(var_id, type_ptr_id, StorageClassInput); + set_decoration(var_id, DecorationBuiltIn, BuiltInLocalInvocationIndex); + builtin_local_invocation_index_id = var_id; + mark_implicit_builtin(StorageClassInput, BuiltInLocalInvocationIndex, var_id); + } + + if (need_workgroup_size && !has_workgroup_size) + { + uint32_t offset = ir.increase_bound_by(2); + uint32_t type_ptr_id = offset; + uint32_t var_id = offset + 1; + + // Create gl_WorkgroupSize. + uint32_t type_id = build_extended_vector_type(get_uint_type_id(), 3); + SPIRType uint_type_ptr = get(type_id); + uint_type_ptr.pointer = true; + uint_type_ptr.parent_type = type_id; + uint_type_ptr.storage = StorageClassInput; + + auto &ptr_type = set(type_ptr_id, uint_type_ptr); + ptr_type.self = type_id; + set(var_id, type_ptr_id, StorageClassInput); + set_decoration(var_id, DecorationBuiltIn, BuiltInWorkgroupSize); + builtin_workgroup_size_id = var_id; + mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var_id); + } } if (needs_swizzle_buffer_def) @@ -4787,7 +4848,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline T spvSubgroupBroadcast(T value, ushort lane)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return quad_broadcast(value, lane);"); else statement("return simd_broadcast(value, lane);"); @@ -4796,7 +4857,7 @@ void CompilerMSL::emit_custom_functions() statement("template<>"); statement("inline bool spvSubgroupBroadcast(bool value, ushort lane)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return !!quad_broadcast((ushort)value, lane);"); else statement("return !!simd_broadcast((ushort)value, lane);"); @@ -4805,7 +4866,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline vec spvSubgroupBroadcast(vec value, ushort lane)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return (vec)quad_broadcast((vec)value, lane);"); else statement("return (vec)simd_broadcast((vec)value, lane);"); @@ -4817,19 +4878,28 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline T spvSubgroupBroadcastFirst(T value)"); begin_scope(); - statement("return simd_broadcast_first(value);"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + statement("return quad_broadcast_first(value);"); + else + statement("return simd_broadcast_first(value);"); end_scope(); statement(""); statement("template<>"); statement("inline bool spvSubgroupBroadcastFirst(bool value)"); begin_scope(); - statement("return !!simd_broadcast_first((ushort)value);"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + statement("return !!quad_broadcast_first((ushort)value);"); + else + statement("return !!simd_broadcast_first((ushort)value);"); end_scope(); statement(""); statement("template"); statement("inline vec spvSubgroupBroadcastFirst(vec value)"); begin_scope(); - statement("return (vec)simd_broadcast_first((vec)value);"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + statement("return (vec)quad_broadcast_first((vec)value);"); + else + statement("return (vec)simd_broadcast_first((vec)value);"); end_scope(); statement(""); break; @@ -4837,13 +4907,26 @@ void CompilerMSL::emit_custom_functions() case SPVFuncImplSubgroupBallot: statement("inline uint4 spvSubgroupBallot(bool value)"); begin_scope(); - statement("simd_vote vote = simd_ballot(value);"); - statement("// simd_ballot() returns a 64-bit integer-like object, but"); - statement("// SPIR-V callers expect a uint4. We must convert."); - statement("// FIXME: This won't include higher bits if Apple ever supports"); - statement("// 128 lanes in an SIMD-group."); - statement("return uint4((uint)((simd_vote::vote_t)vote & 0xFFFFFFFF), (uint)(((simd_vote::vote_t)vote >> " - "32) & 0xFFFFFFFF), 0, 0);"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + { + statement("return uint4((quad_vote::vote_t)quad_ballot(value), 0, 0, 0);"); + } + else if (msl_options.is_ios()) + { + // The current simd_vote on iOS uses a 32-bit integer-like object. + statement("return uint4((simd_vote::vote_t)simd_ballot(value), 0, 0, 0);"); + } + else + { + statement("simd_vote vote = simd_ballot(value);"); + statement("// simd_ballot() returns a 64-bit integer-like object, but"); + statement("// SPIR-V callers expect a uint4. We must convert."); + statement("// FIXME: This won't include higher bits if Apple ever supports"); + statement("// 128 lanes in an SIMD-group."); + statement( + "return uint4((uint)((simd_vote::vote_t)vote & 0xFFFFFFFF), (uint)(((simd_vote::vote_t)vote >> " + "32) & 0xFFFFFFFF), 0, 0);"); + } end_scope(); statement(""); break; @@ -4859,8 +4942,15 @@ void CompilerMSL::emit_custom_functions() case SPVFuncImplSubgroupBallotFindLSB: statement("inline uint spvSubgroupBallotFindLSB(uint4 ballot, uint gl_SubgroupSize)"); begin_scope(); - statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), " - "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0));"); + if (msl_options.is_ios()) + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0));"); + } + else + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), " + "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0));"); + } statement("ballot &= mask;"); statement("return select(ctz(ballot.x), select(32 + ctz(ballot.y), select(64 + ctz(ballot.z), select(96 + " "ctz(ballot.w), uint(-1), ballot.w == 0), ballot.z == 0), ballot.y == 0), ballot.x == 0);"); @@ -4871,8 +4961,15 @@ void CompilerMSL::emit_custom_functions() case SPVFuncImplSubgroupBallotFindMSB: statement("inline uint spvSubgroupBallotFindMSB(uint4 ballot, uint gl_SubgroupSize)"); begin_scope(); - statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), " - "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0));"); + if (msl_options.is_ios()) + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0));"); + } + else + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), " + "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0));"); + } statement("ballot &= mask;"); statement("return select(128 - (clz(ballot.w) + 1), select(96 - (clz(ballot.z) + 1), select(64 - " "(clz(ballot.y) + 1), select(32 - (clz(ballot.x) + 1), uint(-1), ballot.x == 0), ballot.y == 0), " @@ -4889,23 +4986,44 @@ void CompilerMSL::emit_custom_functions() statement(""); statement("inline uint spvSubgroupBallotBitCount(uint4 ballot, uint gl_SubgroupSize)"); begin_scope(); - statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), " - "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0));"); + if (msl_options.is_ios()) + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupSize), uint3(0));"); + } + else + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupSize, 32u)), " + "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupSize - 32, 0)), uint2(0));"); + } statement("return spvPopCount4(ballot & mask);"); end_scope(); statement(""); statement("inline uint spvSubgroupBallotInclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)"); begin_scope(); - statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), " - "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), " - "uint2(0));"); + if (msl_options.is_ios()) + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID + 1), uint3(0));"); + } + else + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID + 1, 32u)), " + "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID + 1 - 32, 0)), " + "uint2(0));"); + } statement("return spvPopCount4(ballot & mask);"); end_scope(); statement(""); statement("inline uint spvSubgroupBallotExclusiveBitCount(uint4 ballot, uint gl_SubgroupInvocationID)"); begin_scope(); - statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), " - "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));"); + if (msl_options.is_ios()) + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, gl_SubgroupInvocationID), uint2(0));"); + } + else + { + statement("uint4 mask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), " + "extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0));"); + } statement("return spvPopCount4(ballot & mask);"); end_scope(); statement(""); @@ -4919,19 +5037,28 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline bool spvSubgroupAllEqual(T value)"); begin_scope(); - statement("return simd_all(all(value == simd_broadcast_first(value)));"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + statement("return quad_all(all(value == quad_broadcast_first(value)));"); + else + statement("return simd_all(all(value == simd_broadcast_first(value)));"); end_scope(); statement(""); statement("template<>"); statement("inline bool spvSubgroupAllEqual(bool value)"); begin_scope(); - statement("return simd_all(value) || !simd_any(value);"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + statement("return quad_all(value) || !quad_any(value);"); + else + statement("return simd_all(value) || !simd_any(value);"); end_scope(); statement(""); statement("template"); statement("inline bool spvSubgroupAllEqual(vec value)"); begin_scope(); - statement("return simd_all(all(value == (vec)simd_broadcast_first((vec)value)));"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + statement("return quad_all(all(value == (vec)quad_broadcast_first((vec)value)));"); + else + statement("return simd_all(all(value == (vec)simd_broadcast_first((vec)value)));"); end_scope(); statement(""); break; @@ -4940,7 +5067,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline T spvSubgroupShuffle(T value, ushort lane)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return quad_shuffle(value, lane);"); else statement("return simd_shuffle(value, lane);"); @@ -4949,7 +5076,7 @@ void CompilerMSL::emit_custom_functions() statement("template<>"); statement("inline bool spvSubgroupShuffle(bool value, ushort lane)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return !!quad_shuffle((ushort)value, lane);"); else statement("return !!simd_shuffle((ushort)value, lane);"); @@ -4958,7 +5085,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline vec spvSubgroupShuffle(vec value, ushort lane)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return (vec)quad_shuffle((vec)value, lane);"); else statement("return (vec)simd_shuffle((vec)value, lane);"); @@ -4970,7 +5097,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline T spvSubgroupShuffleXor(T value, ushort mask)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return quad_shuffle_xor(value, mask);"); else statement("return simd_shuffle_xor(value, mask);"); @@ -4979,7 +5106,7 @@ void CompilerMSL::emit_custom_functions() statement("template<>"); statement("inline bool spvSubgroupShuffleXor(bool value, ushort mask)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return !!quad_shuffle_xor((ushort)value, mask);"); else statement("return !!simd_shuffle_xor((ushort)value, mask);"); @@ -4988,7 +5115,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline vec spvSubgroupShuffleXor(vec value, ushort mask)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return (vec)quad_shuffle_xor((vec)value, mask);"); else statement("return (vec)simd_shuffle_xor((vec)value, mask);"); @@ -5000,7 +5127,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline T spvSubgroupShuffleUp(T value, ushort delta)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return quad_shuffle_up(value, delta);"); else statement("return simd_shuffle_up(value, delta);"); @@ -5009,7 +5136,7 @@ void CompilerMSL::emit_custom_functions() statement("template<>"); statement("inline bool spvSubgroupShuffleUp(bool value, ushort delta)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return !!quad_shuffle_up((ushort)value, delta);"); else statement("return !!simd_shuffle_up((ushort)value, delta);"); @@ -5018,7 +5145,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline vec spvSubgroupShuffleUp(vec value, ushort delta)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return (vec)quad_shuffle_up((vec)value, delta);"); else statement("return (vec)simd_shuffle_up((vec)value, delta);"); @@ -5030,7 +5157,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline T spvSubgroupShuffleDown(T value, ushort delta)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return quad_shuffle_down(value, delta);"); else statement("return simd_shuffle_down(value, delta);"); @@ -5039,7 +5166,7 @@ void CompilerMSL::emit_custom_functions() statement("template<>"); statement("inline bool spvSubgroupShuffleDown(bool value, ushort delta)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return !!quad_shuffle_down((ushort)value, delta);"); else statement("return !!simd_shuffle_down((ushort)value, delta);"); @@ -5048,7 +5175,7 @@ void CompilerMSL::emit_custom_functions() statement("template"); statement("inline vec spvSubgroupShuffleDown(vec value, ushort delta)"); begin_scope(); - if (msl_options.is_ios()) + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) statement("return (vec)quad_shuffle_down((vec)value, delta);"); else statement("return (vec)simd_shuffle_down((vec)value, delta);"); @@ -7677,6 +7804,10 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uin // Use the wider of the two scopes (smaller value) exe_scope = min(exe_scope, mem_scope); + if (msl_options.emulate_subgroups && exe_scope >= ScopeSubgroup && !id_mem_sem) + // In this case, we assume a "subgroup" size of 1. The barrier, then, is a noop. + return; + string bar_stmt; if ((msl_options.is_ios() && msl_options.supports_msl_version(1, 2)) || msl_options.supports_msl_version(2)) bar_stmt = exe_scope < ScopeSubgroup ? "threadgroup_barrier" : "simdgroup_barrier"; @@ -9852,9 +9983,11 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in case BuiltInPrimitiveId: if (msl_options.multi_patch_workgroup) return ""; - /* fallthrough */ + return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " "); case BuiltInSubgroupLocalInvocationId: // FIXME: Should work in any stage case BuiltInSubgroupSize: // FIXME: Should work in any stage + if (msl_options.emulate_subgroups) + return ""; return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " "); case BuiltInPatchVertices: return ""; @@ -10062,15 +10195,18 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in { switch (builtin) { + case BuiltInNumSubgroups: + case BuiltInSubgroupId: + case BuiltInSubgroupLocalInvocationId: // FIXME: Should work in any stage + case BuiltInSubgroupSize: // FIXME: Should work in any stage + if (msl_options.emulate_subgroups) + break; + /* fallthrough */ case BuiltInGlobalInvocationId: case BuiltInWorkgroupId: case BuiltInNumWorkgroups: case BuiltInLocalInvocationId: case BuiltInLocalInvocationIndex: - case BuiltInNumSubgroups: - case BuiltInSubgroupId: - case BuiltInSubgroupLocalInvocationId: // FIXME: Should work in any stage - case BuiltInSubgroupSize: // FIXME: Should work in any stage return string(" [[") + builtin_qualifier(builtin) + "]]"; default: @@ -10336,6 +10472,10 @@ bool CompilerMSL::is_direct_input_builtin(BuiltIn bi_type) case BuiltInViewIndex: return get_execution_model() == ExecutionModelFragment && msl_options.multiview && msl_options.multiview_layered_rendering; + // Compute function in + case BuiltInSubgroupId: + case BuiltInNumSubgroups: + return !msl_options.emulate_subgroups; // Any stage function in case BuiltInDeviceIndex: case BuiltInSubgroupEqMask: @@ -10344,10 +10484,12 @@ bool CompilerMSL::is_direct_input_builtin(BuiltIn bi_type) case BuiltInSubgroupLeMask: case BuiltInSubgroupLtMask: return false; - case BuiltInSubgroupLocalInvocationId: case BuiltInSubgroupSize: - return get_execution_model() == ExecutionModelGLCompute || - (get_execution_model() == ExecutionModelFragment && msl_options.supports_msl_version(2, 2)); + if (msl_options.fixed_subgroup_size != 0) + return false; + /* fallthrough */ + case BuiltInSubgroupLocalInvocationId: + return !msl_options.emulate_subgroups; default: return true; } @@ -10958,67 +11100,78 @@ void CompilerMSL::fix_up_shader_inputs_outputs() entry_func.fixup_hooks_in.push_back([=]() { statement(tc, ".y = 1.0 - ", tc, ".y;"); }); } break; - case BuiltInSubgroupLocalInvocationId: - // This is natively supported in compute shaders. - if (get_execution_model() == ExecutionModelGLCompute) + case BuiltInSubgroupId: + if (!msl_options.emulate_subgroups) break; - - // This is natively supported in fragment shaders in MSL 2.2. - if (get_execution_model() == ExecutionModelFragment && msl_options.supports_msl_version(2, 2)) - break; - - if (msl_options.is_ios()) - SPIRV_CROSS_THROW( - "SubgroupLocalInvocationId cannot be used outside of compute shaders before MSL 2.2 on iOS."); - - if (!msl_options.supports_msl_version(2, 1)) - SPIRV_CROSS_THROW( - "SubgroupLocalInvocationId cannot be used outside of compute shaders before MSL 2.1."); - - // Shaders other than compute shaders don't support the SIMD-group - // builtins directly, but we can emulate them using the SIMD-group - // functions. This might break if some of the subgroup terminated - // before reaching the entry point. + // For subgroup emulation, this is the same as the local invocation index. entry_func.fixup_hooks_in.push_back([=]() { - statement(builtin_type_decl(bi_type), " ", to_expression(var_id), - " = simd_prefix_exclusive_sum(1);"); + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", + to_expression(builtin_local_invocation_index_id), ";"); }); break; - case BuiltInSubgroupSize: - // This is natively supported in compute shaders. - if (get_execution_model() == ExecutionModelGLCompute) + case BuiltInNumSubgroups: + if (!msl_options.emulate_subgroups) break; - - // This is natively supported in fragment shaders in MSL 2.2. - if (get_execution_model() == ExecutionModelFragment && msl_options.supports_msl_version(2, 2)) + // For subgroup emulation, this is the same as the workgroup size. + entry_func.fixup_hooks_in.push_back([=]() { + auto &type = expression_type(builtin_workgroup_size_id); + string size_expr = to_expression(builtin_workgroup_size_id); + if (type.vecsize >= 3) + size_expr = join(size_expr, ".x * ", size_expr, ".y * ", size_expr, ".z"); + else if (type.vecsize == 2) + size_expr = join(size_expr, ".x * ", size_expr, ".y"); + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", size_expr, ";"); + }); + break; + case BuiltInSubgroupLocalInvocationId: + if (!msl_options.emulate_subgroups) break; - - if (msl_options.is_ios()) - SPIRV_CROSS_THROW("SubgroupSize cannot be used outside of compute shaders on iOS."); - - if (!msl_options.supports_msl_version(2, 1)) - SPIRV_CROSS_THROW("SubgroupSize cannot be used outside of compute shaders before Metal 2.1."); - + // For subgroup emulation, assume subgroups of size 1. entry_func.fixup_hooks_in.push_back( - [=]() { statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = simd_sum(1);"); }); + [=]() { statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = 0;"); }); + break; + case BuiltInSubgroupSize: + if (msl_options.emulate_subgroups) + { + // For subgroup emulation, assume subgroups of size 1. + entry_func.fixup_hooks_in.push_back( + [=]() { statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = 1;"); }); + } + else if (msl_options.fixed_subgroup_size != 0) + { + entry_func.fixup_hooks_in.push_back([=]() { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", + msl_options.fixed_subgroup_size, ";"); + }); + } break; case BuiltInSubgroupEqMask: - if (msl_options.is_ios()) - SPIRV_CROSS_THROW("Subgroup ballot functionality is unavailable on iOS."); + if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.2 on iOS."); if (!msl_options.supports_msl_version(2, 1)) SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.1."); entry_func.fixup_hooks_in.push_back([=]() { - statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", - to_expression(builtin_subgroup_invocation_id_id), " >= 32 ? uint4(0, (1 << (", - to_expression(builtin_subgroup_invocation_id_id), " - 32)), uint2(0)) : uint4(1 << ", - to_expression(builtin_subgroup_invocation_id_id), ", uint3(0));"); + if (msl_options.is_ios()) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", "uint4(1 << ", + to_expression(builtin_subgroup_invocation_id_id), ", uint3(0));"); + } + else + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", + to_expression(builtin_subgroup_invocation_id_id), " >= 32 ? uint4(0, (1 << (", + to_expression(builtin_subgroup_invocation_id_id), " - 32)), uint2(0)) : uint4(1 << ", + to_expression(builtin_subgroup_invocation_id_id), ", uint3(0));"); + } }); break; case BuiltInSubgroupGeMask: - if (msl_options.is_ios()) - SPIRV_CROSS_THROW("Subgroup ballot functionality is unavailable on iOS."); + if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.2 on iOS."); if (!msl_options.supports_msl_version(2, 1)) SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.1."); + if (msl_options.fixed_subgroup_size != 0) + add_spv_func_and_recompile(SPVFuncImplSubgroupBallot); entry_func.fixup_hooks_in.push_back([=]() { // Case where index < 32, size < 32: // mask0 = bfi(0, 0xFFFFFFFF, index, size - index); @@ -11034,60 +11187,150 @@ void CompilerMSL::fix_up_shader_inputs_outputs() // This is further complicated by the fact that if you attempt // to bfi/bfe out-of-bounds on Metal, undefined behavior is the // result. - statement(builtin_type_decl(bi_type), " ", to_expression(var_id), - " = uint4(insert_bits(0u, 0xFFFFFFFF, min(", - to_expression(builtin_subgroup_invocation_id_id), ", 32u), (uint)max(min((int)", - to_expression(builtin_subgroup_size_id), ", 32) - (int)", - to_expression(builtin_subgroup_invocation_id_id), - ", 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)", - to_expression(builtin_subgroup_invocation_id_id), " - 32, 0), (uint)max((int)", - to_expression(builtin_subgroup_size_id), " - (int)max(", - to_expression(builtin_subgroup_invocation_id_id), ", 32u), 0)), uint2(0));"); + if (msl_options.fixed_subgroup_size > 32) + { + // Don't use the subgroup size variable with fixed subgroup sizes, + // since the variables could be defined in the wrong order. + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, min(", + to_expression(builtin_subgroup_invocation_id_id), ", 32u), (uint)max(32 - (int)", + to_expression(builtin_subgroup_invocation_id_id), + ", 0)), insert_bits(0u, 0xFFFFFFFF," + " (uint)max((int)", + to_expression(builtin_subgroup_invocation_id_id), " - 32, 0), ", + msl_options.fixed_subgroup_size, " - max(", + to_expression(builtin_subgroup_invocation_id_id), + ", 32u)), uint2(0)) & spvSubgroupBallot(true);"); + } + else if (msl_options.fixed_subgroup_size != 0) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, ", + to_expression(builtin_subgroup_invocation_id_id), ", ", + msl_options.fixed_subgroup_size, " - ", + to_expression(builtin_subgroup_invocation_id_id), + "), uint3(0)) & spvSubgroupBallot(true);"); + } + else if (msl_options.is_ios()) + { + // On iOS, the SIMD-group size will currently never exceed 32. + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, ", + to_expression(builtin_subgroup_invocation_id_id), ", ", + to_expression(builtin_subgroup_size_id), " - ", + to_expression(builtin_subgroup_invocation_id_id), "), uint3(0));"); + } + else + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, min(", + to_expression(builtin_subgroup_invocation_id_id), ", 32u), (uint)max(min((int)", + to_expression(builtin_subgroup_size_id), ", 32) - (int)", + to_expression(builtin_subgroup_invocation_id_id), + ", 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)", + to_expression(builtin_subgroup_invocation_id_id), " - 32, 0), (uint)max((int)", + to_expression(builtin_subgroup_size_id), " - (int)max(", + to_expression(builtin_subgroup_invocation_id_id), ", 32u), 0)), uint2(0));"); + } }); break; case BuiltInSubgroupGtMask: - if (msl_options.is_ios()) - SPIRV_CROSS_THROW("Subgroup ballot functionality is unavailable on iOS."); + if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.2 on iOS."); if (!msl_options.supports_msl_version(2, 1)) SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.1."); + add_spv_func_and_recompile(SPVFuncImplSubgroupBallot); entry_func.fixup_hooks_in.push_back([=]() { // The same logic applies here, except now the index is one // more than the subgroup invocation ID. - statement(builtin_type_decl(bi_type), " ", to_expression(var_id), - " = uint4(insert_bits(0u, 0xFFFFFFFF, min(", - to_expression(builtin_subgroup_invocation_id_id), " + 1, 32u), (uint)max(min((int)", - to_expression(builtin_subgroup_size_id), ", 32) - (int)", - to_expression(builtin_subgroup_invocation_id_id), - " - 1, 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)", - to_expression(builtin_subgroup_invocation_id_id), " + 1 - 32, 0), (uint)max((int)", - to_expression(builtin_subgroup_size_id), " - (int)max(", - to_expression(builtin_subgroup_invocation_id_id), " + 1, 32u), 0)), uint2(0));"); + if (msl_options.fixed_subgroup_size > 32) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, min(", + to_expression(builtin_subgroup_invocation_id_id), " + 1, 32u), (uint)max(32 - (int)", + to_expression(builtin_subgroup_invocation_id_id), + " - 1, 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)", + to_expression(builtin_subgroup_invocation_id_id), " + 1 - 32, 0), ", + msl_options.fixed_subgroup_size, " - max(", + to_expression(builtin_subgroup_invocation_id_id), + " + 1, 32u)), uint2(0)) & " + "spvSubgroupBallot(true);"); + } + else if (msl_options.fixed_subgroup_size != 0) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, ", + to_expression(builtin_subgroup_invocation_id_id), " + 1, ", + msl_options.fixed_subgroup_size, " - ", + to_expression(builtin_subgroup_invocation_id_id), + " - 1), uint3(0)) & spvSubgroupBallot(true);"); + } + else if (msl_options.is_ios()) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, ", + to_expression(builtin_subgroup_invocation_id_id), " + 1, ", + to_expression(builtin_subgroup_size_id), " - ", + to_expression(builtin_subgroup_invocation_id_id), " - 1), uint3(0));"); + } + else + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(insert_bits(0u, 0xFFFFFFFF, min(", + to_expression(builtin_subgroup_invocation_id_id), " + 1, 32u), (uint)max(min((int)", + to_expression(builtin_subgroup_size_id), ", 32) - (int)", + to_expression(builtin_subgroup_invocation_id_id), + " - 1, 0)), insert_bits(0u, 0xFFFFFFFF, (uint)max((int)", + to_expression(builtin_subgroup_invocation_id_id), " + 1 - 32, 0), (uint)max((int)", + to_expression(builtin_subgroup_size_id), " - (int)max(", + to_expression(builtin_subgroup_invocation_id_id), " + 1, 32u), 0)), uint2(0));"); + } }); break; case BuiltInSubgroupLeMask: - if (msl_options.is_ios()) - SPIRV_CROSS_THROW("Subgroup ballot functionality is unavailable on iOS."); + if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.2 on iOS."); if (!msl_options.supports_msl_version(2, 1)) SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.1."); + add_spv_func_and_recompile(SPVFuncImplSubgroupBallot); entry_func.fixup_hooks_in.push_back([=]() { - statement(builtin_type_decl(bi_type), " ", to_expression(var_id), - " = uint4(extract_bits(0xFFFFFFFF, 0, min(", - to_expression(builtin_subgroup_invocation_id_id), - " + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)", - to_expression(builtin_subgroup_invocation_id_id), " + 1 - 32, 0)), uint2(0));"); + if (msl_options.is_ios()) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(extract_bits(0xFFFFFFFF, 0, ", + to_expression(builtin_subgroup_invocation_id_id), " + 1), uint3(0));"); + } + else + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(extract_bits(0xFFFFFFFF, 0, min(", + to_expression(builtin_subgroup_invocation_id_id), + " + 1, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)", + to_expression(builtin_subgroup_invocation_id_id), " + 1 - 32, 0)), uint2(0));"); + } }); break; case BuiltInSubgroupLtMask: - if (msl_options.is_ios()) - SPIRV_CROSS_THROW("Subgroup ballot functionality is unavailable on iOS."); + if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.2 on iOS."); if (!msl_options.supports_msl_version(2, 1)) SPIRV_CROSS_THROW("Subgroup ballot functionality requires Metal 2.1."); + add_spv_func_and_recompile(SPVFuncImplSubgroupBallot); entry_func.fixup_hooks_in.push_back([=]() { - statement(builtin_type_decl(bi_type), " ", to_expression(var_id), - " = uint4(extract_bits(0xFFFFFFFF, 0, min(", - to_expression(builtin_subgroup_invocation_id_id), - ", 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)", - to_expression(builtin_subgroup_invocation_id_id), " - 32, 0)), uint2(0));"); + if (msl_options.is_ios()) + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(extract_bits(0xFFFFFFFF, 0, ", + to_expression(builtin_subgroup_invocation_id_id), "), uint3(0));"); + } + else + { + statement(builtin_type_decl(bi_type), " ", to_expression(var_id), + " = uint4(extract_bits(0xFFFFFFFF, 0, min(", + to_expression(builtin_subgroup_invocation_id_id), + ", 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)", + to_expression(builtin_subgroup_invocation_id_id), " - 32, 0)), uint2(0));"); + } }); break; case BuiltInViewIndex: @@ -12376,10 +12619,23 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) const uint32_t *ops = stream(i); auto op = static_cast(i.op); - // Metal 2.0 is required. iOS only supports quad ops. macOS only supports - // broadcast and shuffle on 10.13 (2.0), with full support in 10.14 (2.1). - // Note that iOS makes no distinction between a quad-group and a subgroup; - // all subgroups are quad-groups there. + if (msl_options.emulate_subgroups) + { + // In this mode, only the GroupNonUniform cap is supported. The only op + // we need to handle, then, is OpGroupNonUniformElect. + if (op != OpGroupNonUniformElect) + SPIRV_CROSS_THROW("Subgroup emulation does not support operations other than Elect."); + // In this mode, the subgroup size is assumed to be one, so every invocation + // is elected. + emit_op(ops[0], ops[1], "true", true); + return; + } + + // Metal 2.0 is required. iOS only supports quad ops on 11.0 (2.0), with + // full support in 13.0 (2.2). macOS only supports broadcast and shuffle on + // 10.13 (2.0), with full support in 10.14 (2.1). + // Note that Apple GPUs before A13 make no distinction between a quad-group + // and a SIMD-group; all SIMD-groups are quad-groups on those. if (!msl_options.supports_msl_version(2)) SPIRV_CROSS_THROW("Subgroups are only supported in Metal 2.0 and up."); @@ -12388,12 +12644,32 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) auto int_type = to_signed_basetype(integer_width); auto uint_type = to_unsigned_basetype(integer_width); - if (msl_options.is_ios()) + if (msl_options.is_ios() && (!msl_options.supports_msl_version(2, 3) || !msl_options.ios_use_simdgroup_functions)) { switch (op) { default: - SPIRV_CROSS_THROW("iOS only supports quad-group operations."); + SPIRV_CROSS_THROW("Subgroup ops beyond broadcast, ballot, and shuffle on iOS require Metal 2.3 and up."); + case OpGroupNonUniformBroadcastFirst: + if (!msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("BroadcastFirst on iOS requires Metal 2.2 and up."); + break; + case OpGroupNonUniformElect: + if (!msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Elect on iOS requires Metal 2.2 and up."); + break; + case OpGroupNonUniformAny: + case OpGroupNonUniformAll: + case OpGroupNonUniformAllEqual: + case OpGroupNonUniformBallot: + case OpGroupNonUniformInverseBallot: + case OpGroupNonUniformBallotBitExtract: + case OpGroupNonUniformBallotFindLSB: + case OpGroupNonUniformBallotFindMSB: + case OpGroupNonUniformBallotBitCount: + if (!msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Ballot ops on iOS requires Metal 2.2 and up."); + break; case OpGroupNonUniformBroadcast: case OpGroupNonUniformShuffle: case OpGroupNonUniformShuffleXor: @@ -12430,7 +12706,10 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) switch (op) { case OpGroupNonUniformElect: - emit_op(result_type, id, "simd_is_first()", true); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + emit_op(result_type, id, "quad_is_first()", false); + else + emit_op(result_type, id, "simd_is_first()", false); break; case OpGroupNonUniformBroadcast: @@ -12501,11 +12780,17 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) break; case OpGroupNonUniformAll: - emit_unary_func_op(result_type, id, ops[3], "simd_all"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + emit_unary_func_op(result_type, id, ops[3], "quad_all"); + else + emit_unary_func_op(result_type, id, ops[3], "simd_all"); break; case OpGroupNonUniformAny: - emit_unary_func_op(result_type, id, ops[3], "simd_any"); + if (msl_options.is_ios() && !msl_options.ios_use_simdgroup_functions) + emit_unary_func_op(result_type, id, ops[3], "quad_any"); + else + emit_unary_func_op(result_type, id, ops[3], "simd_any"); break; case OpGroupNonUniformAllEqual: @@ -12969,6 +13254,9 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) return "thread_index_in_threadgroup"; case BuiltInSubgroupSize: + if (msl_options.emulate_subgroups || msl_options.fixed_subgroup_size != 0) + // Shouldn't be reached. + SPIRV_CROSS_THROW("Emitting threads_per_simdgroup attribute with fixed subgroup size??"); if (execution.model == ExecutionModelFragment) { if (!msl_options.supports_msl_version(2, 2)) @@ -12983,16 +13271,25 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) } case BuiltInNumSubgroups: + if (msl_options.emulate_subgroups) + // Shouldn't be reached. + SPIRV_CROSS_THROW("NumSubgroups is handled specially with emulation."); if (!msl_options.supports_msl_version(2)) SPIRV_CROSS_THROW("Subgroup builtins require Metal 2.0."); return msl_options.is_ios() ? "quadgroups_per_threadgroup" : "simdgroups_per_threadgroup"; case BuiltInSubgroupId: + if (msl_options.emulate_subgroups) + // Shouldn't be reached. + SPIRV_CROSS_THROW("SubgroupId is handled specially with emulation."); if (!msl_options.supports_msl_version(2)) SPIRV_CROSS_THROW("Subgroup builtins require Metal 2.0."); return msl_options.is_ios() ? "quadgroup_index_in_threadgroup" : "simdgroup_index_in_threadgroup"; case BuiltInSubgroupLocalInvocationId: + if (msl_options.emulate_subgroups) + // Shouldn't be reached. + SPIRV_CROSS_THROW("SubgroupLocalInvocationId is handled specially with emulation."); if (execution.model == ExecutionModelFragment) { if (!msl_options.supports_msl_version(2, 2)) diff --git a/spirv_msl.hpp b/spirv_msl.hpp index a26047e0..0b78df47 100644 --- a/spirv_msl.hpp +++ b/spirv_msl.hpp @@ -364,6 +364,28 @@ public: // and will be addressed using the current ViewIndex. bool arrayed_subpass_input = false; + // Whether to use SIMD-group or quadgroup functions to implement group nnon-uniform + // operations. Some GPUs on iOS do not support the SIMD-group functions, only the + // quadgroup functions. + bool ios_use_simdgroup_functions = false; + + // If set, the subgroup size will be assumed to be one, and subgroup-related + // builtins and operations will be emitted accordingly. This mode is intended to + // be used by MoltenVK on hardware/software configurations which do not provide + // sufficient support for subgroups. + bool emulate_subgroups = false; + + // If nonzero, a fixed subgroup size to assume. Metal, similarly to VK_EXT_subgroup_size_control, + // allows the SIMD-group size (aka thread execution width) to vary depending on + // register usage and requirements. In certain circumstances--for example, a pipeline + // in MoltenVK without VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT-- + // this is undesirable. This fixes the value of the SubgroupSize builtin, instead of + // mapping it to the Metal builtin [[thread_execution_width]]. If the thread + // execution width is reduced, the extra invocations will appear to be inactive. + // If zero, the SubgroupSize will be allowed to vary, and the builtin will be mapped + // to the Metal [[thread_execution_width]] builtin. + uint32_t fixed_subgroup_size = 0; + enum class IndexType { None = 0, @@ -853,6 +875,8 @@ protected: uint32_t builtin_subgroup_size_id = 0; uint32_t builtin_dispatch_base_id = 0; uint32_t builtin_stage_input_size_id = 0; + uint32_t builtin_local_invocation_index_id = 0; + uint32_t builtin_workgroup_size_id = 0; uint32_t swizzle_buffer_id = 0; uint32_t buffer_size_buffer_id = 0; uint32_t view_mask_buffer_id = 0; diff --git a/test_shaders.py b/test_shaders.py index 119617a7..d73f8598 100755 --- a/test_shaders.py +++ b/test_shaders.py @@ -308,6 +308,14 @@ def cross_compile_msl(shader, spirv, opt, iterations, paths): msl_args.append('--msl-arrayed-subpass-input') if '.1d-as-2d.' in shader: msl_args.append('--msl-texture-1d-as-2d') + if '.simd.' in shader: + msl_args.append('--msl-ios-use-simdgroup-functions') + if '.emulate-subgroup.' in shader: + msl_args.append('--msl-emulate-subgroups') + if '.fixed-subgroup.' in shader: + # Arbitrary for testing purposes. + msl_args.append('--msl-fixed-subgroup-size') + msl_args.append('32') subprocess.check_call(msl_args)