Never flatten outputs when capturing them.

There's no need to do so, since these are not stage-out structs being returned, but regular structures being written to a buffer. This also neatly avoids issues writing to composite (e.g. arrayed) per-patch outputs from a tessellation control shader.
2019-02-11 17:18:54 -06:00 · 2019-02-11 17:18:54 -06:00 · 0bb6bbda22
commit 0bb6bbda22
parent 8860a97d4a
4 changed files with 127 additions and 14 deletions
--- a/reference/opt/shaders-msl/desktop-only/tesc/arrayed-output.desktop.sso.tesc
+++ b/reference/opt/shaders-msl/desktop-only/tesc/arrayed-output.desktop.sso.tesc
@ -0,0 +1,42 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float3 vVertex;
+};
+
+struct main0_patchOut
+{
+    float3 vPatch[2];
+};
+
+struct main0_in
+{
+    float3 vInput [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], device uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
+{
+    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
+    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
+    if (gl_InvocationID < spvIndirectParams[0])
+        gl_in[gl_InvocationID] = in;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    gl_out[gl_InvocationID].vVertex = gl_in[gl_InvocationID].vInput + gl_in[gl_InvocationID ^ 1].vInput;
+    threadgroup_barrier(mem_flags::mem_device);
+    if (gl_InvocationID == 0)
+    {
+        patchOut.vPatch[0] = float3(10.0);
+        patchOut.vPatch[1] = float3(20.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+    }
+}
+
--- a/reference/shaders-msl/desktop-only/tesc/arrayed-output.desktop.sso.tesc
+++ b/reference/shaders-msl/desktop-only/tesc/arrayed-output.desktop.sso.tesc
@ -0,0 +1,42 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float3 vVertex;
+};
+
+struct main0_patchOut
+{
+    float3 vPatch[2];
+};
+
+struct main0_in
+{
+    float3 vInput [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], device uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
+{
+    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
+    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
+    if (gl_InvocationID < spvIndirectParams[0])
+        gl_in[gl_InvocationID] = in;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    gl_out[gl_InvocationID].vVertex = gl_in[gl_InvocationID].vInput + gl_in[gl_InvocationID ^ 1].vInput;
+    threadgroup_barrier(mem_flags::mem_device);
+    if (gl_InvocationID == 0)
+    {
+        patchOut.vPatch[0] = float3(10.0);
+        patchOut.vPatch[1] = float3(20.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+    }
+}
+
--- a/shaders-msl/desktop-only/tesc/arrayed-output.desktop.sso.tesc
+++ b/shaders-msl/desktop-only/tesc/arrayed-output.desktop.sso.tesc
@ -0,0 +1,27 @@
+#version 450
+
+layout(vertices = 4) out;
+layout(location = 0) patch out vec3 vPatch[2];
+layout(location = 2) out vec3 vVertex[];
+layout(location = 0) in vec3 vInput[];
+
+void main()
+{
+        vVertex[gl_InvocationID] =
+                vInput[gl_InvocationID] +
+                vInput[gl_InvocationID ^ 1];
+
+        barrier();
+
+        if (gl_InvocationID == 0)
+        {
+                vPatch[0] = vec3(10.0);
+                vPatch[1] = vec3(20.0);
+                gl_TessLevelOuter[0] = 1.0;
+                gl_TessLevelOuter[1] = 2.0;
+                gl_TessLevelOuter[2] = 3.0;
+                gl_TessLevelOuter[3] = 4.0;
+                gl_TessLevelInner[0] = 1.0;
+                gl_TessLevelInner[1] = 2.0;
+        }
+}
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@ -1044,12 +1044,14 @@ void CompilerMSL::add_plain_variable_to_interface_block(StorageClass storage, co
 	uint32_t type_id = ensure_correct_builtin_type(var.basetype, builtin);
 	var.basetype = type_id;

-	auto &type = strip_array ? get_variable_element_type(var) : get_variable_data_type(var);
+	type_id = get_pointee_type_id(var.basetype);
+	if (strip_array && is_array(get<SPIRType>(type_id)))
+		type_id = get<SPIRType>(type_id).parent_type;
+	auto &type = get<SPIRType>(type_id);
 	uint32_t target_components = 0;
 	uint32_t type_components = type.vecsize;
 	bool padded_output = false;

-	type_id = type.self;
 	// Check if we need to pad fragment output to match a certain number of components.
 	if (get_decoration_bitset(var.self).get(DecorationLocation) && msl_options.pad_fragment_output_components &&
 	    get_entry_point().model == ExecutionModelFragment && storage == StorageClassOutput)
@ -1532,7 +1534,7 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st

 	if (var_type.basetype == SPIRType::Struct)
 	{
-		if (!is_builtin_type(var_type) && !strip_array)
+		if (!is_builtin_type(var_type) && (!capture_output_to_buffer || storage == StorageClassInput) && !strip_array)
 		{
 			// For I/O blocks or structs, we will need to pass the block itself around
 			// to functions if they are used globally in leaf functions.
@ -1544,14 +1546,13 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st
 			vars_needing_early_declaration.push_back(var.self);
 		}

-		// Per-vertex outputs in a tess. control shader need special handling.
-		if (strip_array && storage != StorageClassInput && !has_decoration(var_type.self, DecorationBlock))
+		if (capture_output_to_buffer && storage != StorageClassInput && !has_decoration(var_type.self, DecorationBlock))
 		{
-			// We can't flatten the struct, because of the requirement that these be passed around as arrays.
-			// In Metal, the interface block itself is arrayed. This makes things very complicated, since
-			// stage-in structures in MSL don't support nested structures. Luckily, for stage-out in a
-			// tessellation control shader, we can get away with this because the structure is stored
-			// to a buffer, not returned.
+			// In Metal tessellation shaders, the interface block itself is arrayed. This makes things
+			// very complicated, since stage-in structures in MSL don't support nested structures.
+			// Luckily, for stage-out when capturing output, we can avoid this and just add
+			// composite members directly, because the stage-out structure is stored to a buffer,
+			// not returned.
 			add_plain_variable_to_interface_block(storage, ib_var_ref, ib_type, var, strip_array);
 		}
 		else
@ -1589,7 +1590,8 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st
 		if (!is_builtin || has_active_builtin(builtin, storage))
 		{
 			// MSL does not allow matrices or arrays in input or output variables, so need to handle it specially.
-			if (!is_builtin && (storage == StorageClassInput || storage == StorageClassOutput) &&
+			if (!is_builtin &&
+			    (storage == StorageClassInput || (storage == StorageClassOutput && !capture_output_to_buffer)) &&
 			    (is_matrix(var_type) || is_array(var_type)))
 			{
 				add_composite_variable_to_interface_block(storage, ib_var_ref, ib_type, var, strip_array);
@ -5248,9 +5250,9 @@ string CompilerMSL::entry_point_args(bool append_comma)
 			{
 				if (!ep_args.empty())
 					ep_args += ", ";
-				ep_args += join("device ", type_to_glsl(get_patch_stage_out_struct_type()), "* ",
-				                patch_output_buffer_var_name, " [[buffer(",
-				                convert_to_string(msl_options.shader_patch_output_buffer_index), ")]]");
+				ep_args +=
+				    join("device ", type_to_glsl(get_patch_stage_out_struct_type()), "* ", patch_output_buffer_var_name,
+				         " [[buffer(", convert_to_string(msl_options.shader_patch_output_buffer_index), ")]]");
 			}
 			if (!ep_args.empty())
 				ep_args += ", ";