MSL: Add option to force depth write in fragment shaders

Metal writes to the depth/stencil attachment before fragment
shader execution if the fragment shader does not modify the
depth value. However, Vulkan expects the write to happen after
fragment shader execution. To work around the issue, we add a
simple depth passthrough when the user opts in. This is only
required when the depth/stencil attachment is also used as an
input attachment; Metal does not appear to detect that
dependency correctly.
Aitor Camacho 2024-05-20 16:14:36 +02:00
parent 5934c8fc1c
commit a43fabbe2a
13 changed files with 236 additions and 10 deletions
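For context, and not part of this commit: a minimal sketch of enabling the new behavior through the SPIRV-Cross C++ API rather than the CLI flag. Only the option name input_attachment_is_ds_attachment is taken from the diff below; the surrounding calls assume the usual CompilerMSL workflow (get_msl_options/set_msl_options/compile).

#include "spirv_msl.hpp"

#include <string>
#include <utility>
#include <vector>

// Sketch only: compile a fragment shader with the depth-write workaround enabled.
std::string compile_with_forced_depth_write(std::vector<uint32_t> spirv_binary)
{
    spirv_cross::CompilerMSL msl(std::move(spirv_binary));

    spirv_cross::CompilerMSL::Options opts = msl.get_msl_options();
    // Opt in when the subpass input aliases the depth/stencil attachment, so the
    // generated MSL keeps the depth write after fragment shader execution.
    opts.input_attachment_is_ds_attachment = true;
    msl.set_msl_options(opts);

    return msl.compile();
}

The CLI flag added below, --msl-input-attachment-is-ds-attachment, toggles the same option.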

@@ -681,6 +681,7 @@ struct CLIArguments
bool msl_replace_recursive_inputs = false;
bool msl_readwrite_texture_fences = true;
bool msl_agx_manual_cube_grad_fixup = false;
bool msl_input_attachment_is_ds_attachment = false;
const char *msl_combined_sampler_suffix = nullptr;
bool glsl_emit_push_constant_as_ubo = false;
bool glsl_emit_ubo_as_plain_uniforms = false;
@@ -873,6 +874,10 @@ static void print_help_msl()
"\t[--msl-runtime-array-rich-descriptor]:\n\t\tWhen declaring a runtime array of SSBOs, declare an array of {ptr, len} pairs to support OpArrayLength.\n"
"\t[--msl-replace-recursive-inputs]:\n\t\tWorks around a Metal 3.1 regression bug, which causes an infinite recursion crash during Metal's analysis of an entry point input structure that itself contains internal recursion.\n"
"\t[--msl-texture-buffer-native]:\n\t\tEnable native support for texel buffers. Otherwise, it is emulated as a normal texture.\n"
"\t[--msl-input-attachment-is-ds-attachment]:\n\t\tAdds a simple depth passthrough in fragment shaders when they do not modify the depth value.\n"
"\t\tRequired to force Metal to write to the depth/stencil attachment post fragment execution.\n"
"\t\tOtherwise, Metal may optimize the write to pre fragment execution which goes against the Vulkan spec.\n"
"\t\tOnly required if an input attachment and depth/stencil attachment reference the same resource.\n"
"\t[--msl-framebuffer-fetch]:\n\t\tImplement subpass inputs with frame buffer fetch.\n"
"\t\tEmits [[color(N)]] inputs in fragment stage.\n"
"\t\tRequires an Apple GPU.\n"
@@ -1257,6 +1262,7 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
msl_opts.ios_support_base_vertex_instance = true;
msl_opts.runtime_array_rich_descriptor = args.msl_runtime_array_rich_descriptor;
msl_opts.replace_recursive_inputs = args.msl_replace_recursive_inputs;
msl_opts.input_attachment_is_ds_attachment = args.msl_input_attachment_is_ds_attachment;
msl_opts.readwrite_texture_fences = args.msl_readwrite_texture_fences;
msl_opts.agx_manual_cube_grad_fixup = args.msl_agx_manual_cube_grad_fixup;
msl_comp->set_msl_options(msl_opts);
@@ -1823,6 +1829,7 @@ static int main_inner(int argc, char *argv[])
[&args](CLIParser &) { args.msl_runtime_array_rich_descriptor = true; });
cbs.add("--msl-replace-recursive-inputs",
[&args](CLIParser &) { args.msl_replace_recursive_inputs = true; });
cbs.add("--msl-input-attachment-is-ds-attachment", [&args](CLIParser &) { args.msl_input_attachment_is_ds_attachment = true; });
cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
cbs.add("--rename-entry-point", [&args](CLIParser &parser) {
auto old_name = parser.next_string();

@@ -0,0 +1,17 @@
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 color [[color(0)]];
};

[[ early_fragment_tests ]] fragment main0_out main0(texture2d<float> inputDepth [[texture(0)]], float4 gl_FragCoord [[position]])
{
    main0_out out = {};
    out.color = inputDepth.read(uint2(gl_FragCoord.xy));
    return out;
}

@@ -0,0 +1,19 @@
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 color [[color(0)]];
    float gl_FragDepth [[depth(any)]];
};

fragment main0_out main0(texture2d<float> inputDepth [[texture(0)]], float4 gl_FragCoord [[position]])
{
    main0_out out = {};
    out.color = inputDepth.read(uint2(gl_FragCoord.xy));
    out.gl_FragDepth = 1.0;
    return out;
}

@@ -0,0 +1,19 @@
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 color [[color(0)]];
    float gl_FragDepth [[depth(any)]];
};

fragment main0_out main0(texture2d<float> inputDepth [[texture(0)]], float4 gl_FragCoord [[position]])
{
    main0_out out = {};
    out.color = inputDepth.read(uint2(gl_FragCoord.xy));
    out.gl_FragDepth = gl_FragCoord.z;
    return out;
}

@@ -0,0 +1,17 @@
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 color [[color(0)]];
};

[[ early_fragment_tests ]] fragment main0_out main0(texture2d<float> inputDepth [[texture(0)]], float4 gl_FragCoord [[position]])
{
    main0_out out = {};
    out.color = inputDepth.read(uint2(gl_FragCoord.xy));
    return out;
}

@@ -0,0 +1,19 @@
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 color [[color(0)]];
    float gl_FragDepth [[depth(any)]];
};

fragment main0_out main0(texture2d<float> inputDepth [[texture(0)]], float4 gl_FragCoord [[position]])
{
    main0_out out = {};
    out.color = inputDepth.read(uint2(gl_FragCoord.xy));
    out.gl_FragDepth = 1.0;
    return out;
}

@@ -0,0 +1,19 @@
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 color [[color(0)]];
    float gl_FragDepth [[depth(any)]];
};

fragment main0_out main0(texture2d<float> inputDepth [[texture(0)]], float4 gl_FragCoord [[position]])
{
    main0_out out = {};
    out.color = inputDepth.read(uint2(gl_FragCoord.xy));
    out.gl_FragDepth = gl_FragCoord.z;
    return out;
}

@@ -0,0 +1,12 @@
#version 450

layout(early_fragment_tests) in;

layout (input_attachment_index = 0, binding = 0) uniform subpassInput inputDepth;

layout (location = 0) out vec4 color;

void main()
{
    color = subpassLoad(inputDepth);
}

@@ -0,0 +1,11 @@
#version 450

layout (input_attachment_index = 0, binding = 0) uniform subpassInput inputDepth;

layout (location = 0) out vec4 color;

void main()
{
    color = subpassLoad(inputDepth);
    gl_FragDepth = 1.0f;
}

@@ -0,0 +1,10 @@
#version 450

layout (input_attachment_index = 0, binding = 0) uniform subpassInput inputDepth;

layout (location = 0) out vec4 color;

void main()
{
    color = subpassLoad(inputDepth);
}

@@ -271,11 +271,14 @@ void CompilerMSL::build_implicit_builtins()
active_input_builtins.get(BuiltInInstanceIndex) || active_input_builtins.get(BuiltInBaseInstance));
bool need_local_invocation_index = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInSubgroupId);
bool need_workgroup_size = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInNumSubgroups);
bool force_frag_depth_passthrough =
get_execution_model() == ExecutionModelFragment && !uses_explicit_early_fragment_test() && need_subpass_input &&
msl_options.enable_frag_depth_builtin && msl_options.input_attachment_is_ds_attachment;
if (need_subpass_input || need_sample_pos || need_subgroup_mask || need_vertex_params || need_tesc_params ||
need_tese_params || need_multiview || need_dispatch_base || need_vertex_base_params || need_grid_params ||
needs_sample_id || needs_subgroup_invocation_id || needs_subgroup_size || needs_helper_invocation ||
has_additional_fixed_sample_mask() || need_local_invocation_index || need_workgroup_size)
has_additional_fixed_sample_mask() || need_local_invocation_index || need_workgroup_size || force_frag_depth_passthrough)
{
bool has_frag_coord = false;
bool has_sample_id = false;
@@ -292,6 +295,7 @@ void CompilerMSL::build_implicit_builtins()
bool has_helper_invocation = false;
bool has_local_invocation_index = false;
bool has_workgroup_size = false;
bool has_frag_depth = false;
uint32_t workgroup_id_type = 0;
ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
@@ -312,6 +316,13 @@ void CompilerMSL::build_implicit_builtins()
mark_implicit_builtin(StorageClassOutput, BuiltInSampleMask, var.self);
does_shader_write_sample_mask = true;
}
if (force_frag_depth_passthrough && builtin == BuiltInFragDepth)
{
builtin_frag_depth_id = var.self;
mark_implicit_builtin(StorageClassOutput, BuiltInFragDepth, var.self);
has_frag_depth = true;
}
}
if (var.storage != StorageClassInput)
@@ -902,6 +913,36 @@ void CompilerMSL::build_implicit_builtins()
builtin_workgroup_size_id = var_id;
mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var_id);
}
if (!has_frag_depth && force_frag_depth_passthrough)
{
uint32_t offset = ir.increase_bound_by(3);
uint32_t type_id = offset;
uint32_t type_ptr_id = offset + 1;
uint32_t var_id = offset + 2;
// Create gl_FragDepth
SPIRType float_type { OpTypeFloat };
float_type.basetype = SPIRType::Float;
float_type.width = 32;
float_type.vecsize = 1;
set<SPIRType>(type_id, float_type);
SPIRType float_type_ptr_in = float_type;
float_type_ptr_in.op = spv::OpTypePointer;
float_type_ptr_in.pointer = true;
float_type_ptr_in.pointer_depth++;
float_type_ptr_in.parent_type = type_id;
float_type_ptr_in.storage = StorageClassOutput;
auto &ptr_in_type = set<SPIRType>(type_ptr_id, float_type_ptr_in);
ptr_in_type.self = type_id;
set<SPIRVariable>(var_id, type_ptr_id, StorageClassOutput);
set_decoration(var_id, DecorationBuiltIn, BuiltInFragDepth);
builtin_frag_depth_id = var_id;
mark_implicit_builtin(StorageClassOutput, BuiltInFragDepth, var_id);
active_output_builtins.set(BuiltInFragDepth);
}
}
if (needs_swizzle_buffer_def)
@@ -1571,6 +1612,8 @@ string CompilerMSL::compile()
add_active_interface_variable(builtin_dispatch_base_id);
if (builtin_sample_mask_id)
add_active_interface_variable(builtin_sample_mask_id);
if (builtin_frag_depth_id)
add_active_interface_variable(builtin_frag_depth_id);
// Create structs to hold input, output and uniform variables.
// Do output first to ensure out. is declared at top of entry function.
@@ -1869,8 +1912,13 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
{
uint32_t base_id = ops[0];
if (global_var_ids.find(base_id) != global_var_ids.end())
{
added_arg_ids.insert(base_id);
if (msl_options.input_attachment_is_ds_attachment && base_id == builtin_frag_depth_id)
writes_to_depth = true;
}
uint32_t rvalue_id = ops[1];
if (global_var_ids.find(rvalue_id) != global_var_ids.end())
added_arg_ids.insert(rvalue_id);
@@ -14513,16 +14561,33 @@ void CompilerMSL::fix_up_shader_inputs_outputs()
}
}
else if (var.storage == StorageClassOutput && get_execution_model() == ExecutionModelFragment &&
is_builtin_variable(var) && active_output_builtins.get(bi_type) &&
bi_type == BuiltInSampleMask && has_additional_fixed_sample_mask())
is_builtin_variable(var) && active_output_builtins.get(bi_type))
{
// If the additional fixed sample mask was set, we need to adjust the sample_mask
// output to reflect that. If the shader outputs the sample_mask itself too, we need
// to AND the two masks to get the final one.
string op_str = does_shader_write_sample_mask ? " &= " : " = ";
entry_func.fixup_hooks_out.push_back([=]() {
statement(to_expression(builtin_sample_mask_id), op_str, additional_fixed_sample_mask_str(), ";");
});
switch (bi_type)
{
case BuiltInSampleMask:
if (has_additional_fixed_sample_mask())
{
// If the additional fixed sample mask was set, we need to adjust the sample_mask
// output to reflect that. If the shader outputs the sample_mask itself too, we need
// to AND the two masks to get the final one.
string op_str = does_shader_write_sample_mask ? " &= " : " = ";
entry_func.fixup_hooks_out.push_back([=]() {
statement(to_expression(builtin_sample_mask_id), op_str, additional_fixed_sample_mask_str(), ";");
});
}
break;
case BuiltInFragDepth:
if (msl_options.input_attachment_is_ds_attachment && !writes_to_depth)
{
entry_func.fixup_hooks_out.push_back([=]() {
statement(to_expression(builtin_frag_depth_id), " = ", to_expression(builtin_frag_coord_id), ".z;");
});
}
break;
default:
break;
}
}
});
}
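As a side note on the spirv_msl.cpp changes above, and not SPIRV-Cross code itself: a toy sketch of the deferred fixup-hook pattern the change relies on. Analysis sets writes_to_depth when it sees an explicit store to the gl_FragDepth builtin; if that flag stays false and the option is enabled, a hook queued on the entry point emits the passthrough assignment when the body of main0 is written out. EntryPoint and the printed string below are illustrative stand-ins for SPIRFunction::fixup_hooks_out and the statement() call in the diff.

#include <functional>
#include <iostream>
#include <vector>

// Toy stand-in: hooks run when the entry point body is emitted.
struct EntryPoint
{
    std::vector<std::function<void()>> fixup_hooks_out;
};

int main()
{
    EntryPoint entry_func;
    bool input_attachment_is_ds_attachment = true; // user opted in
    bool writes_to_depth = false;                  // no explicit gl_FragDepth store was found

    if (input_attachment_is_ds_attachment && !writes_to_depth)
    {
        // Queue the depth passthrough, mirroring the fixup hook added in the diff.
        entry_func.fixup_hooks_out.push_back([] {
            std::cout << "    out.gl_FragDepth = gl_FragCoord.z;\n";
        });
    }

    // At code generation time the hooks fire just before the entry point returns.
    for (auto &hook : entry_func.fixup_hooks_out)
        hook();
}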

@@ -529,6 +529,13 @@ public:
// with side effects. Provided as an option hoping Metal will fix this issue in the future.
bool force_fragment_with_side_effects_execution = false;
// If set, adds a depth pass through statement to circumvent the following issue:
// When the same depth/stencil is used as input and depth/stencil attachment, we need to
// force Metal to perform the depth/stencil write after fragment execution. Otherwise,
// Metal will write to the depth attachment before fragment execution. This happens
// if the fragment does not modify the depth value.
bool input_attachment_is_ds_attachment = false;
bool is_ios() const
{
return platform == iOS;
@@ -1094,6 +1101,7 @@ protected:
uint32_t builtin_stage_input_size_id = 0;
uint32_t builtin_local_invocation_index_id = 0;
uint32_t builtin_workgroup_size_id = 0;
uint32_t builtin_frag_depth_id = 0;
uint32_t swizzle_buffer_id = 0;
uint32_t buffer_size_buffer_id = 0;
uint32_t view_mask_buffer_id = 0;
@@ -1190,6 +1198,7 @@ protected:
bool needs_subgroup_size = false;
bool needs_sample_id = false;
bool needs_helper_invocation = false;
bool writes_to_depth = false;
std::string qual_pos_var_name;
std::string stage_in_var_name = "in";
std::string stage_out_var_name = "out";

@@ -388,6 +388,8 @@ def cross_compile_msl(shader, spirv, opt, iterations, paths):
msl_args.append('--msl-runtime-array-rich-descriptor')
if '.replace-recursive-inputs.' in shader:
msl_args.append('--msl-replace-recursive-inputs')
if '.input-attachment-is-ds-attachment.' in shader:
msl_args.append('--msl-input-attachment-is-ds-attachment')
if '.mask-location-0.' in shader:
msl_args.append('--mask-stage-output-location')
msl_args.append('0')