MSL: Add support for processing more than one patch per workgroup.

This should hopefully reduce underutilization of the GPU, especially on GPUs where the thread execution width is greater than the number of control points. This also simplifies initialization by reading the buffer directly instead of using Metal's vertex-attribute-in-compute support. It turns out the only way in which shader stages are allowed to differ in their interfaces is in the number of components per vector; the base type must be the same. Since we are using the raw buffer instead of attributes, we can now also emit arrays and matrices directly into the buffer, instead of flattening them and then unpacking them. Structs are still flattened, however; this is due to the need to handle vectors with fewer components than were output, and I think handling this while also directly emitting structs could get ugly. Another advantage of this scheme is that the extra invocations needed to read the attributes when there were more input than output points are now no more. The number of threads per workgroup is now lcm(SIMD-size, output control points). This should ensure we always process a whole number of patches per workgroup. To avoid complexity handling indices in the tessellation control shader, I've also changed the way vertex shaders for tessellation are handled. They are now compute kernels using Metal's support for vertex-style stage input. This lets us always emit vertices into the buffer in order of vertex shader execution. Now we no longer have to deal with indexing in the tessellation control shader. This also fixes a long-standing issue where if an index were greater than the number of vertices to draw, the vertex shader would wind up writing outside the buffer, and the vertex would be lost. This is a breaking change, and I know SPIRV-Cross has other clients, so I've hidden this behind an option for now. In the future, I want to remove this option and make it the default.
2024-11-08 21:30:05 +00:00 · 2020-02-20 21:38:28 -06:00 · 2020-02-20 21:38:28 -06:00 · 688c5fcbda
commit 688c5fcbda
parent 3dcc23a5b3
46 changed files with 2649 additions and 113 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -323,7 +323,7 @@ if (SPIRV_CROSS_STATIC)
 endif()

 set(spirv-cross-abi-major 0)
-set(spirv-cross-abi-minor 35)
+set(spirv-cross-abi-minor 36)
 set(spirv-cross-abi-patch 0)

 if (SPIRV_CROSS_SHARED)
--- a/main.cpp
+++ b/main.cpp
@ -558,6 +558,8 @@ struct CLIArguments
 	bool msl_enable_frag_stencil_ref_builtin = true;
 	uint32_t msl_enable_frag_output_mask = 0xffffffff;
 	bool msl_enable_clip_distance_user_varying = true;
+	bool msl_multi_patch_workgroup = false;
+	bool msl_vertex_for_tessellation = false;
 	bool glsl_emit_push_constant_as_ubo = false;
 	bool glsl_emit_ubo_as_plain_uniforms = false;
 	SmallVector<pair<uint32_t, uint32_t>> glsl_ext_framebuffer_fetch;
@ -747,9 +749,15 @@ static void print_help_msl()
 	                "\t[--msl-enable-frag-output-mask <mask>]:\n\t\tOnly selectively enable fragment outputs. Useful if pipeline does not enable fragment output for certain locations, as pipeline creation might otherwise fail.\n"
 	                "\t[--msl-no-clip-distance-user-varying]:\n\t\tDo not emit user varyings to emulate gl_ClipDistance in fragment shaders.\n"
 	                "\t[--msl-shader-input <index> <format> <size>]:\n\t\tSpecify the format of the shader input at <index>.\n"
-	                "\t\t<format> can be 'u16', 'u8', or 'other', to indicate a 16-bit unsigned integer, 8-bit unsigned integer, "
+	                "\t\t<format> can be 'any32', 'any16', 'u16', 'u8', or 'other', to indicate a 32-bit opaque value, 16-bit opaque value, 16-bit unsigned integer, 8-bit unsigned integer, "
 	                "or other-typed variable. <size> is the vector length of the variable, which must be greater than or equal to that declared in the shader.\n"
-	                "\t\tUseful if shader stage interfaces don't match up, as pipeline creation might otherwise fail.\n");
+	                "\t\tUseful if shader stage interfaces don't match up, as pipeline creation might otherwise fail.\n"
+	                "\t[--msl-multi-patch-workgroup]:\n\t\tUse the new style of tessellation control processing, where multiple patches are processed per workgroup.\n"
+					"\t\tThis should increase throughput by ensuring all the GPU's SIMD lanes are occupied, but it is not compatible with the old style.\n"
+					"\t\tIn addition, this style also passes input variables in buffers directly instead of using vertex attribute processing.\n"
+					"\t\tIn a future version of SPIRV-Cross, this will become the default.\n"
+	                "\t[--msl-vertex-for-tessellation]:\n\t\tWhen handling a vertex shader, marks it as one that will be used with a new-style tessellation control shader.\n"
+					"\t\tThe vertex shader is output to MSL as a compute kernel which outputs vertices to the buffer in the order they are received, rather than in index order as with --msl-capture-output normally.\n");
 	// clang-format on
 }

@ -983,6 +991,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		msl_opts.enable_frag_stencil_ref_builtin = args.msl_enable_frag_stencil_ref_builtin;
 		msl_opts.enable_frag_output_mask = args.msl_enable_frag_output_mask;
 		msl_opts.enable_clip_distance_user_varying = args.msl_enable_clip_distance_user_varying;
+		msl_opts.multi_patch_workgroup = args.msl_multi_patch_workgroup;
+		msl_opts.vertex_for_tessellation = args.msl_vertex_for_tessellation;
 		msl_comp->set_msl_options(msl_opts);
 		for (auto &v : args.msl_discrete_descriptor_sets)
 			msl_comp->add_discrete_descriptor_set(v);
@ -1381,15 +1391,21 @@ static int main_inner(int argc, char *argv[])
 		// Make sure next_uint() is called in-order.
 		input.location = parser.next_uint();
 		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "u16") == 0)
-			input.format = MSL_VERTEX_FORMAT_UINT16;
+		if (strcmp(format, "any32") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_ANY32;
+		else if (strcmp(format, "any16") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_ANY16;
+		else if (strcmp(format, "u16") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_UINT16;
 		else if (strcmp(format, "u8") == 0)
-			input.format = MSL_VERTEX_FORMAT_UINT8;
+			input.format = MSL_SHADER_INPUT_FORMAT_UINT8;
 		else
-			input.format = MSL_VERTEX_FORMAT_OTHER;
+			input.format = MSL_SHADER_INPUT_FORMAT_OTHER;
 		input.vecsize = parser.next_uint();
 		args.msl_shader_inputs.push_back(input);
 	});
+	cbs.add("--msl-multi-patch-workgroup", [&args](CLIParser &) { args.msl_multi_patch_workgroup = true; });
+	cbs.add("--msl-vertex-for-tessellation", [&args](CLIParser &) { args.msl_vertex_for_tessellation = true; });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point", [&args](CLIParser &parser) {
 		auto old_name = parser.next_string();
--- a/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
+++ b/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TessLevels
+{
+    float inner0;
+    float inner1;
+    float outer0;
+    float outer1;
+    float outer2;
+    float outer3;
+};
+
+kernel void main0(const device TessLevels& sb_levels [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(sb_levels.inner0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(sb_levels.outer0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(sb_levels.outer1);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(sb_levels.outer2);
+}
+
--- a/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
+++ b/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@ -0,0 +1,39 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+struct main0_in
+{
+    uint3 m_86;
+    ushort2 m_90;
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 1];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 1;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
--- a/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -0,0 +1,38 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Boo
+{
+    float3 a;
+    uint3 b;
+};
+
+struct main0_out
+{
+    Boo vVertex;
+};
+
+struct main0_in
+{
+    float3 Boo_a;
+    uint3 Boo_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
+    gl_out[gl_InvocationID].vVertex = _26;
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+}
+
--- a/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
+++ b/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_BaseVertex = spvDispatchBase.x;
+    uint gl_BaseInstance = spvDispatchBase.y;
+    out.gl_Position = float4(float(int(gl_BaseVertex)), float(int(gl_BaseInstance)), 0.0, 1.0);
+}
+
--- a/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4x4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 vInputs;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4x4, 32> _16 = spvUnsafeArray<float4x4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4x4, 32> tmp;
+    tmp = _16;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -0,0 +1,78 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexData
+{
+    float4x4 a;
+    spvUnsafeArray<float4, 2> b;
+    float4 c;
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 VertexData_a;
+    spvUnsafeArray<float4, 2> VertexData_b;
+    float4 VertexData_c;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> tmp;
+    tmp = _19;
+    int _27 = gl_InvocationID ^ 1;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + gl_in[_27].VertexData_c;
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@ -0,0 +1,69 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4 vInputs;
+    ushort2 m_43;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4, 32> _15 = spvUnsafeArray<float4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4, 32> tmp;
+    tmp = _15;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
@ -0,0 +1,91 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4 uScale;
+    float3 uCamPos;
+    float2 uPatchSize;
+    float2 uMaxTessLevel;
+    float uDistanceMod;
+    float4 uFrustum[6];
+};
+
+struct main0_patchOut
+{
+    float2 vOutPatchPosBase;
+    float4 vPatchLods;
+};
+
+struct main0_in
+{
+    float3 vPatchPosBase;
+    ushort2 m_996;
+};
+
+kernel void main0(constant UBO& _41 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    float2 _431 = (gl_in[0].vPatchPosBase.xy - float2(10.0)) * _41.uScale.xy;
+    float2 _441 = ((gl_in[0].vPatchPosBase.xy + _41.uPatchSize) + float2(10.0)) * _41.uScale.xy;
+    float3 _446 = float3(_431.x, -10.0, _431.y);
+    float3 _451 = float3(_441.x, 10.0, _441.y);
+    float4 _467 = float4((_446 + _451) * 0.5, 1.0);
+    float3 _514 = float3(length(_451 - _446) * (-0.5));
+    bool _516 = any(float3(dot(_41.uFrustum[0], _467), dot(_41.uFrustum[1], _467), dot(_41.uFrustum[2], _467)) <= _514);
+    bool _526;
+    if (!_516)
+    {
+        _526 = any(float3(dot(_41.uFrustum[3], _467), dot(_41.uFrustum[4], _467), dot(_41.uFrustum[5], _467)) <= _514);
+    }
+    else
+    {
+        _526 = _516;
+    }
+    if (!(!_526))
+    {
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(-1.0);
+    }
+    else
+    {
+        patchOut.vOutPatchPosBase = gl_in[0].vPatchPosBase.xy;
+        float2 _681 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _710 = (gl_in[0].vPatchPosBase.xy + (float2(0.5, -0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _729 = fast::clamp(log2((length(_41.uCamPos - float3(_710.x, 0.0, _710.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _739 = (gl_in[0].vPatchPosBase.xy + (float2(1.5, -0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _768 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5, 0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _787 = fast::clamp(log2((length(_41.uCamPos - float3(_768.x, 0.0, _768.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _797 = (gl_in[0].vPatchPosBase.xy + (float2(0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _816 = fast::clamp(log2((length(_41.uCamPos - float3(_797.x, 0.0, _797.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _826 = (gl_in[0].vPatchPosBase.xy + (float2(1.5, 0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _845 = fast::clamp(log2((length(_41.uCamPos - float3(_826.x, 0.0, _826.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _855 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5, 1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _884 = (gl_in[0].vPatchPosBase.xy + (float2(0.5, 1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _903 = fast::clamp(log2((length(_41.uCamPos - float3(_884.x, 0.0, _884.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _913 = (gl_in[0].vPatchPosBase.xy + (float2(1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _614 = dot(float4(_787, _816, fast::clamp(log2((length(_41.uCamPos - float3(_855.x, 0.0, _855.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _903), float4(0.25));
+        float _620 = dot(float4(fast::clamp(log2((length(_41.uCamPos - float3(_681.x, 0.0, _681.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _729, _787, _816), float4(0.25));
+        float _626 = dot(float4(_729, fast::clamp(log2((length(_41.uCamPos - float3(_739.x, 0.0, _739.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _816, _845), float4(0.25));
+        float _632 = dot(float4(_816, _845, _903, fast::clamp(log2((length(_41.uCamPos - float3(_913.x, 0.0, _913.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x)), float4(0.25));
+        float4 _633 = float4(_614, _620, _626, _632);
+        patchOut.vPatchLods = _633;
+        float4 _940 = exp2(-fast::min(_633, _633.yzwx)) * _41.uMaxTessLevel.y;
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_940.x);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_940.y);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(_940.z);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(_940.w);
+        float _948 = _41.uMaxTessLevel.y * exp2(-fast::min(fast::min(fast::min(_614, _620), fast::min(_626, _632)), _816));
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(_948);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(_948);
+    }
+}
+
--- a/reference/opt/shaders-msl/vert/basic.for-tess.vert
+++ b/reference/opt/shaders-msl/vert/basic.for-tess.vert
@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _16.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
--- a/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
+++ b/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _18.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
--- a/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
+++ b/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _10
+{
+    uint4 _m0[1024];
+};
+
+struct main0_in
+{
+    uint4 m_19 [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], device _10& _12 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]])
+{
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_VertexIndex = gl_GlobalInvocationID.x + spvDispatchBase.x;
+    _12._m0[int(gl_VertexIndex)] = in.m_19;
+}
+
--- a/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@ -0,0 +1,140 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexOutput
+{
+    float4 pos;
+    float2 uv;
+};
+
+struct HSOut
+{
+    float4 pos;
+    float2 uv;
+};
+
+struct HSConstantOut
+{
+    spvUnsafeArray<float, 3> EdgeTess;
+    float InsideTess;
+};
+
+struct VertexOutput_1
+{
+    float3 uv;
+};
+
+struct HSOut_1
+{
+    float2 uv;
+};
+
+struct main0_out
+{
+    HSOut_1 _entryPointOutput;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float3 VertexOutput_uv;
+    ushort2 m_172;
+    float4 gl_Position;
+};
+
+static inline __attribute__((always_inline))
+HSOut _hs_main(thread const spvUnsafeArray<VertexOutput, 3> (&p), thread const uint& i)
+{
+    HSOut _output;
+    _output.pos = p[i].pos;
+    _output.uv = p[i].uv;
+    return _output;
+}
+
+static inline __attribute__((always_inline))
+HSConstantOut PatchHS(thread const spvUnsafeArray<VertexOutput, 3> (&_patch))
+{
+    HSConstantOut _output;
+    _output.EdgeTess[0] = (float2(1.0) + _patch[0].uv).x;
+    _output.EdgeTess[1] = (float2(1.0) + _patch[0].uv).x;
+    _output.EdgeTess[2] = (float2(1.0) + _patch[0].uv).x;
+    _output.InsideTess = (float2(1.0) + _patch[0].uv).x;
+    return _output;
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 3];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 3;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1]);
+    spvUnsafeArray<VertexOutput, 3> p;
+    p[0].pos = gl_in[0].gl_Position;
+    p[0].uv = gl_in[0].VertexOutput_uv.xy;
+    p[1].pos = gl_in[1].gl_Position;
+    p[1].uv = gl_in[1].VertexOutput_uv.xy;
+    p[2].pos = gl_in[2].gl_Position;
+    p[2].uv = gl_in[2].VertexOutput_uv.xy;
+    uint i = gl_InvocationID;
+    spvUnsafeArray<VertexOutput, 3> param;
+    param = p;
+    uint param_1 = i;
+    HSOut flattenTemp = _hs_main(param, param_1);
+    gl_out[gl_InvocationID].gl_Position = flattenTemp.pos;
+    gl_out[gl_InvocationID]._entryPointOutput.uv = flattenTemp.uv;
+    threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup);
+    if (int(gl_InvocationID) == 0)
+    {
+        spvUnsafeArray<VertexOutput, 3> param_2;
+        param_2 = p;
+        HSConstantOut _patchConstantResult = PatchHS(param_2);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_patchConstantResult.EdgeTess[0]);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_patchConstantResult.EdgeTess[1]);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(_patchConstantResult.EdgeTess[2]);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(_patchConstantResult.InsideTess);
+    }
+}
+
--- a/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
+++ b/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TessLevels
+{
+    float inner0;
+    float inner1;
+    float outer0;
+    float outer1;
+    float outer2;
+    float outer3;
+};
+
+kernel void main0(const device TessLevels& sb_levels [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(sb_levels.inner0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(sb_levels.outer0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(sb_levels.outer1);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(sb_levels.outer2);
+}
+
--- a/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
+++ b/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@ -0,0 +1,47 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+struct main0_in
+{
+    uint3 m_78;
+    ushort2 m_82;
+    float4 gl_Position;
+};
+
+static inline __attribute__((always_inline))
+void set_position(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device main0_in* thread & gl_in)
+{
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 1];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 1;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+    set_position(gl_out, gl_InvocationID, gl_in);
+}
+
--- a/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -0,0 +1,38 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Boo
+{
+    float3 a;
+    uint3 b;
+};
+
+struct main0_out
+{
+    Boo vVertex;
+};
+
+struct main0_in
+{
+    float3 Boo_a;
+    uint3 Boo_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
+    gl_out[gl_InvocationID].vVertex = _26;
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+}
+
--- a/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
+++ b/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_BaseVertex = spvDispatchBase.x;
+    uint gl_BaseInstance = spvDispatchBase.y;
+    out.gl_Position = float4(float(int(gl_BaseVertex)), float(int(gl_BaseInstance)), 0.0, 1.0);
+}
+
--- a/reference/shaders-msl/tesc/basic.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/basic.multi-patch.tesc
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4x4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 vInputs;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4x4, 32> _16 = spvUnsafeArray<float4x4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4x4, 32> tmp;
+    tmp = _16;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -0,0 +1,80 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexData
+{
+    float4x4 a;
+    spvUnsafeArray<float4, 2> b;
+    float4 c;
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 VertexData_a;
+    spvUnsafeArray<float4, 2> VertexData_b;
+    float4 VertexData_c;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> tmp;
+    tmp = _19;
+    int _27 = gl_InvocationID ^ 1;
+    VertexData _30 = VertexData{ gl_in[_27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[_27].VertexData_b[0], gl_in[_27].VertexData_b[1] }), gl_in[_27].VertexData_c };
+    VertexData tmp_single = _30;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + tmp_single.c;
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@ -0,0 +1,69 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4 vInputs;
+    ushort2 m_43;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4, 32> _15 = spvUnsafeArray<float4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4, 32> tmp;
+    tmp = _15;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/shaders-msl/tesc/water_tess.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/water_tess.multi-patch.tesc
@ -0,0 +1,135 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4 uScale;
+    float3 uCamPos;
+    float2 uPatchSize;
+    float2 uMaxTessLevel;
+    float uDistanceMod;
+    float4 uFrustum[6];
+};
+
+struct main0_patchOut
+{
+    float2 vOutPatchPosBase;
+    float4 vPatchLods;
+};
+
+struct main0_in
+{
+    float3 vPatchPosBase;
+    ushort2 m_430;
+};
+
+static inline __attribute__((always_inline))
+bool frustum_cull(thread const float2& p0, constant UBO& v_41)
+{
+    float2 min_xz = (p0 - float2(10.0)) * v_41.uScale.xy;
+    float2 max_xz = ((p0 + v_41.uPatchSize) + float2(10.0)) * v_41.uScale.xy;
+    float3 bb_min = float3(min_xz.x, -10.0, min_xz.y);
+    float3 bb_max = float3(max_xz.x, 10.0, max_xz.y);
+    float3 center = (bb_min + bb_max) * 0.5;
+    float radius = 0.5 * length(bb_max - bb_min);
+    float3 f0 = float3(dot(v_41.uFrustum[0], float4(center, 1.0)), dot(v_41.uFrustum[1], float4(center, 1.0)), dot(v_41.uFrustum[2], float4(center, 1.0)));
+    float3 f1 = float3(dot(v_41.uFrustum[3], float4(center, 1.0)), dot(v_41.uFrustum[4], float4(center, 1.0)), dot(v_41.uFrustum[5], float4(center, 1.0)));
+    bool _205 = any(f0 <= float3(-radius));
+    bool _215;
+    if (!_205)
+    {
+        _215 = any(f1 <= float3(-radius));
+    }
+    else
+    {
+        _215 = _205;
+    }
+    return !_215;
+}
+
+static inline __attribute__((always_inline))
+float lod_factor(thread const float2& pos_, constant UBO& v_41)
+{
+    float2 pos = pos_ * v_41.uScale.xy;
+    float3 dist_to_cam = v_41.uCamPos - float3(pos.x, 0.0, pos.y);
+    float level = log2((length(dist_to_cam) + 9.9999997473787516355514526367188e-05) * v_41.uDistanceMod);
+    return fast::clamp(level, 0.0, v_41.uMaxTessLevel.x);
+}
+
+static inline __attribute__((always_inline))
+float4 tess_level(thread const float4& lod, constant UBO& v_41)
+{
+    return exp2(-lod) * v_41.uMaxTessLevel.y;
+}
+
+static inline __attribute__((always_inline))
+float tess_level(thread const float& lod, constant UBO& v_41)
+{
+    return v_41.uMaxTessLevel.y * exp2(-lod);
+}
+
+static inline __attribute__((always_inline))
+void compute_tess_levels(thread const float2& p0, constant UBO& v_41, device float2& vOutPatchPosBase, device float4& vPatchLods, device half (&gl_TessLevelOuter)[4], device half (&gl_TessLevelInner)[2])
+{
+    vOutPatchPosBase = p0;
+    float2 param = p0 + (float2(-0.5) * v_41.uPatchSize);
+    float l00 = lod_factor(param, v_41);
+    float2 param_1 = p0 + (float2(0.5, -0.5) * v_41.uPatchSize);
+    float l10 = lod_factor(param_1, v_41);
+    float2 param_2 = p0 + (float2(1.5, -0.5) * v_41.uPatchSize);
+    float l20 = lod_factor(param_2, v_41);
+    float2 param_3 = p0 + (float2(-0.5, 0.5) * v_41.uPatchSize);
+    float l01 = lod_factor(param_3, v_41);
+    float2 param_4 = p0 + (float2(0.5) * v_41.uPatchSize);
+    float l11 = lod_factor(param_4, v_41);
+    float2 param_5 = p0 + (float2(1.5, 0.5) * v_41.uPatchSize);
+    float l21 = lod_factor(param_5, v_41);
+    float2 param_6 = p0 + (float2(-0.5, 1.5) * v_41.uPatchSize);
+    float l02 = lod_factor(param_6, v_41);
+    float2 param_7 = p0 + (float2(0.5, 1.5) * v_41.uPatchSize);
+    float l12 = lod_factor(param_7, v_41);
+    float2 param_8 = p0 + (float2(1.5) * v_41.uPatchSize);
+    float l22 = lod_factor(param_8, v_41);
+    float4 lods = float4(dot(float4(l01, l11, l02, l12), float4(0.25)), dot(float4(l00, l10, l01, l11), float4(0.25)), dot(float4(l10, l20, l11, l21), float4(0.25)), dot(float4(l11, l21, l12, l22), float4(0.25)));
+    vPatchLods = lods;
+    float4 outer_lods = fast::min(lods, lods.yzwx);
+    float4 param_9 = outer_lods;
+    float4 levels = tess_level(param_9, v_41);
+    gl_TessLevelOuter[0] = half(levels.x);
+    gl_TessLevelOuter[1] = half(levels.y);
+    gl_TessLevelOuter[2] = half(levels.z);
+    gl_TessLevelOuter[3] = half(levels.w);
+    float min_lod = fast::min(fast::min(lods.x, lods.y), fast::min(lods.z, lods.w));
+    float param_10 = fast::min(min_lod, l11);
+    float inner = tess_level(param_10, v_41);
+    gl_TessLevelInner[0] = half(inner);
+    gl_TessLevelInner[1] = half(inner);
+}
+
+kernel void main0(constant UBO& v_41 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    float2 p0 = gl_in[0].vPatchPosBase.xy;
+    float2 param = p0;
+    if (!frustum_cull(param, v_41))
+    {
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(-1.0);
+    }
+    else
+    {
+        float2 param_1 = p0;
+        compute_tess_levels(param_1, v_41, patchOut.vOutPatchPosBase, patchOut.vPatchLods, spvTessLevel[gl_PrimitiveID].edgeTessellationFactor, spvTessLevel[gl_PrimitiveID].insideTessellationFactor);
+    }
+}
+
--- a/reference/shaders-msl/vert/basic.for-tess.vert
+++ b/reference/shaders-msl/vert/basic.for-tess.vert
@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _16.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
--- a/reference/shaders-msl/vert/leaf-function.for-tess.vert
+++ b/reference/shaders-msl/vert/leaf-function.for-tess.vert
@ -0,0 +1,39 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+static inline __attribute__((always_inline))
+void set_output(device float4& gl_Position, constant UBO& v_18, thread float4& aVertex, device float3& vNormal, thread float3& aNormal)
+{
+    gl_Position = v_18.uMVP * aVertex;
+    vNormal = aNormal;
+}
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& v_18 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    set_output(out.gl_Position, v_18, in.aVertex, out.vNormal, in.aNormal);
+}
+
--- a/reference/shaders-msl/vert/no_stage_out.for-tess.vert
+++ b/reference/shaders-msl/vert/no_stage_out.for-tess.vert
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _10
+{
+    uint4 _m0[1024];
+};
+
+struct main0_in
+{
+    uint4 m_19 [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], device _10& _12 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]])
+{
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_VertexIndex = gl_GlobalInvocationID.x + spvDispatchBase.x;
+    _12._m0[int(gl_VertexIndex)] = in.m_19;
+}
+
--- a/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
+++ b/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@ -0,0 +1,248 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 2
+; Bound: 162
+; Schema: 0
+               OpCapability Tessellation
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %hs_main "main" %p_pos %p_1 %i_1 %_entryPointOutput_pos %_entryPointOutput %_patchConstantOutput_EdgeTess %_patchConstantOutput_InsideTess
+               OpExecutionMode %hs_main OutputVertices 3
+               OpExecutionMode %hs_main Triangles
+               OpExecutionMode %hs_main SpacingFractionalOdd
+               OpExecutionMode %hs_main VertexOrderCw
+               OpSource HLSL 500
+               OpName %hs_main "hs_main"
+               OpName %VertexOutput "VertexOutput"
+               OpMemberName %VertexOutput 0 "pos"
+               OpMemberName %VertexOutput 1 "uv"
+               OpName %HSOut "HSOut"
+               OpMemberName %HSOut 0 "pos"
+               OpMemberName %HSOut 1 "uv"
+               OpName %_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ "@hs_main(struct-VertexOutput-vf4-vf21[3];u1;"
+               OpName %p "p"
+               OpName %i "i"
+               OpName %HSConstantOut "HSConstantOut"
+               OpMemberName %HSConstantOut 0 "EdgeTess"
+               OpMemberName %HSConstantOut 1 "InsideTess"
+               OpName %PatchHS_struct_VertexOutput_vf4_vf21_3__ "PatchHS(struct-VertexOutput-vf4-vf21[3];"
+               OpName %patch "patch"
+               OpName %output "output"
+               OpName %p_0 "p"
+               OpName %p_pos "p.pos"
+               OpName %VertexOutput_0 "VertexOutput"
+               OpMemberName %VertexOutput_0 0 "uv"
+               OpName %p_1 "p"
+               OpName %i_0 "i"
+               OpName %i_1 "i"
+               OpName %flattenTemp "flattenTemp"
+               OpName %param "param"
+               OpName %param_0 "param"
+               OpName %_entryPointOutput_pos "@entryPointOutput.pos"
+               OpName %HSOut_0 "HSOut"
+               OpMemberName %HSOut_0 0 "uv"
+               OpName %_entryPointOutput "@entryPointOutput"
+               OpName %_patchConstantResult "@patchConstantResult"
+               OpName %param_1 "param"
+               OpName %_patchConstantOutput_EdgeTess "@patchConstantOutput.EdgeTess"
+               OpName %_patchConstantOutput_InsideTess "@patchConstantOutput.InsideTess"
+               OpName %output_0 "output"
+               OpDecorate %p_pos BuiltIn Position
+               OpDecorate %p_1 Location 0
+               OpDecorate %i_1 BuiltIn InvocationId
+               OpDecorate %_entryPointOutput_pos BuiltIn Position
+               OpDecorate %_entryPointOutput Location 0
+               OpDecorate %_patchConstantOutput_EdgeTess Patch
+               OpDecorate %_patchConstantOutput_EdgeTess BuiltIn TessLevelOuter
+               OpDecorate %_patchConstantOutput_InsideTess Patch
+               OpDecorate %_patchConstantOutput_InsideTess BuiltIn TessLevelInner
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+    %v2float = OpTypeVector %float 2
+%VertexOutput = OpTypeStruct %v4float %v2float
+       %uint = OpTypeInt 32 0
+     %uint_3 = OpConstant %uint 3
+%_arr_VertexOutput_uint_3 = OpTypeArray %VertexOutput %uint_3
+%_ptr_Function__arr_VertexOutput_uint_3 = OpTypePointer Function %_arr_VertexOutput_uint_3
+%_ptr_Function_uint = OpTypePointer Function %uint
+      %HSOut = OpTypeStruct %v4float %v2float
+         %16 = OpTypeFunction %HSOut %_ptr_Function__arr_VertexOutput_uint_3 %_ptr_Function_uint
+%_arr_float_uint_3 = OpTypeArray %float %uint_3
+%HSConstantOut = OpTypeStruct %_arr_float_uint_3 %float
+         %23 = OpTypeFunction %HSConstantOut %_ptr_Function__arr_VertexOutput_uint_3
+%_ptr_Function_HSOut = OpTypePointer Function %HSOut
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Function_v4float = OpTypePointer Function %v4float
+      %int_1 = OpConstant %int 1
+%_ptr_Function_v2float = OpTypePointer Function %v2float
+%_arr_v4float_uint_3 = OpTypeArray %v4float %uint_3
+%_ptr_Input__arr_v4float_uint_3 = OpTypePointer Input %_arr_v4float_uint_3
+      %p_pos = OpVariable %_ptr_Input__arr_v4float_uint_3 Input
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+%VertexOutput_0 = OpTypeStruct %v2float
+%_arr_VertexOutput_0_uint_3 = OpTypeArray %VertexOutput_0 %uint_3
+%_ptr_Input__arr_VertexOutput_0_uint_3 = OpTypePointer Input %_arr_VertexOutput_0_uint_3
+        %p_1 = OpVariable %_ptr_Input__arr_VertexOutput_0_uint_3 Input
+%_ptr_Input_v2float = OpTypePointer Input %v2float
+      %int_2 = OpConstant %int 2
+%_ptr_Input_uint = OpTypePointer Input %uint
+        %i_1 = OpVariable %_ptr_Input_uint Input
+%_ptr_Output__arr_v4float_uint_3 = OpTypePointer Output %_arr_v4float_uint_3
+%_entryPointOutput_pos = OpVariable %_ptr_Output__arr_v4float_uint_3 Output
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+    %HSOut_0 = OpTypeStruct %v2float
+%_arr_HSOut_0_uint_3 = OpTypeArray %HSOut_0 %uint_3
+%_ptr_Output__arr_HSOut_0_uint_3 = OpTypePointer Output %_arr_HSOut_0_uint_3
+%_entryPointOutput = OpVariable %_ptr_Output__arr_HSOut_0_uint_3 Output
+%_ptr_Output_v2float = OpTypePointer Output %v2float
+     %uint_2 = OpConstant %uint 2
+     %uint_1 = OpConstant %uint 1
+     %uint_0 = OpConstant %uint 0
+       %bool = OpTypeBool
+%_ptr_Function_HSConstantOut = OpTypePointer Function %HSConstantOut
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%_patchConstantOutput_EdgeTess = OpVariable %_ptr_Output__arr_float_uint_4 Output
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_Output_float = OpTypePointer Output %float
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%_patchConstantOutput_InsideTess = OpVariable %_ptr_Output__arr_float_uint_2 Output
+    %float_1 = OpConstant %float 1
+    %hs_main = OpFunction %void None %3
+          %5 = OpLabel
+        %p_0 = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+        %i_0 = OpVariable %_ptr_Function_uint Function
+%flattenTemp = OpVariable %_ptr_Function_HSOut Function
+      %param = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+    %param_0 = OpVariable %_ptr_Function_uint Function
+%_patchConstantResult = OpVariable %_ptr_Function_HSConstantOut Function
+    %param_1 = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+         %50 = OpAccessChain %_ptr_Input_v4float %p_pos %int_0
+         %51 = OpLoad %v4float %50
+         %52 = OpAccessChain %_ptr_Function_v4float %p_0 %int_0 %int_0
+               OpStore %52 %51
+         %58 = OpAccessChain %_ptr_Input_v2float %p_1 %int_0 %int_0
+         %59 = OpLoad %v2float %58
+         %60 = OpAccessChain %_ptr_Function_v2float %p_0 %int_0 %int_1
+               OpStore %60 %59
+         %61 = OpAccessChain %_ptr_Input_v4float %p_pos %int_1
+         %62 = OpLoad %v4float %61
+         %63 = OpAccessChain %_ptr_Function_v4float %p_0 %int_1 %int_0
+               OpStore %63 %62
+         %64 = OpAccessChain %_ptr_Input_v2float %p_1 %int_1 %int_0
+         %65 = OpLoad %v2float %64
+         %66 = OpAccessChain %_ptr_Function_v2float %p_0 %int_1 %int_1
+               OpStore %66 %65
+         %68 = OpAccessChain %_ptr_Input_v4float %p_pos %int_2
+         %69 = OpLoad %v4float %68
+         %70 = OpAccessChain %_ptr_Function_v4float %p_0 %int_2 %int_0
+               OpStore %70 %69
+         %71 = OpAccessChain %_ptr_Input_v2float %p_1 %int_2 %int_0
+         %72 = OpLoad %v2float %71
+         %73 = OpAccessChain %_ptr_Function_v2float %p_0 %int_2 %int_1
+               OpStore %73 %72
+         %77 = OpLoad %uint %i_1
+               OpStore %i_0 %77
+         %80 = OpLoad %_arr_VertexOutput_uint_3 %p_0
+               OpStore %param %80
+         %82 = OpLoad %uint %i_0
+               OpStore %param_0 %82
+         %83 = OpFunctionCall %HSOut %_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ %param %param_0
+               OpStore %flattenTemp %83
+         %86 = OpAccessChain %_ptr_Function_v4float %flattenTemp %int_0
+         %87 = OpLoad %v4float %86
+         %94 = OpLoad %uint %i_1
+         %89 = OpAccessChain %_ptr_Output_v4float %_entryPointOutput_pos %94
+               OpStore %89 %87
+         %95 = OpAccessChain %_ptr_Function_v2float %flattenTemp %int_1
+         %96 = OpLoad %v2float %95
+         %98 = OpAccessChain %_ptr_Output_v2float %_entryPointOutput %94 %int_0
+               OpStore %98 %96
+               OpControlBarrier %uint_2 %uint_1 %uint_0
+        %102 = OpLoad %uint %i_1
+        %104 = OpIEqual %bool %102 %int_0
+               OpSelectionMerge %106 None
+               OpBranchConditional %104 %105 %106
+        %105 = OpLabel
+        %110 = OpLoad %_arr_VertexOutput_uint_3 %p_0
+               OpStore %param_1 %110
+        %111 = OpFunctionCall %HSConstantOut %PatchHS_struct_VertexOutput_vf4_vf21_3__ %param_1
+               OpStore %_patchConstantResult %111
+        %117 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_0
+        %118 = OpLoad %float %117
+        %120 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_0
+               OpStore %120 %118
+        %121 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_1
+        %122 = OpLoad %float %121
+        %123 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_1
+               OpStore %123 %122
+        %124 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_2
+        %125 = OpLoad %float %124
+        %126 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_2
+               OpStore %126 %125
+        %130 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_1
+        %131 = OpLoad %float %130
+        %132 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_InsideTess %int_0
+               OpStore %132 %131
+               OpBranch %106
+        %106 = OpLabel
+               OpReturn
+               OpFunctionEnd
+%_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ = OpFunction %HSOut None %16
+          %p = OpFunctionParameter %_ptr_Function__arr_VertexOutput_uint_3
+          %i = OpFunctionParameter %_ptr_Function_uint
+         %20 = OpLabel
+     %output = OpVariable %_ptr_Function_HSOut Function
+         %31 = OpLoad %uint %i
+         %33 = OpAccessChain %_ptr_Function_v4float %p %31 %int_0
+         %34 = OpLoad %v4float %33
+         %35 = OpAccessChain %_ptr_Function_v4float %output %int_0
+               OpStore %35 %34
+         %37 = OpLoad %uint %i
+         %39 = OpAccessChain %_ptr_Function_v2float %p %37 %int_1
+         %40 = OpLoad %v2float %39
+         %41 = OpAccessChain %_ptr_Function_v2float %output %int_1
+               OpStore %41 %40
+         %42 = OpLoad %HSOut %output
+               OpReturnValue %42
+               OpFunctionEnd
+%PatchHS_struct_VertexOutput_vf4_vf21_3__ = OpFunction %HSConstantOut None %23
+      %patch = OpFunctionParameter %_ptr_Function__arr_VertexOutput_uint_3
+         %26 = OpLabel
+   %output_0 = OpVariable %_ptr_Function_HSConstantOut Function
+        %135 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %136 = OpLoad %v2float %135
+        %137 = OpCompositeConstruct %v2float %float_1 %float_1
+        %138 = OpFAdd %v2float %137 %136
+        %139 = OpCompositeExtract %float %138 0
+        %140 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_0
+               OpStore %140 %139
+        %141 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %142 = OpLoad %v2float %141
+        %143 = OpCompositeConstruct %v2float %float_1 %float_1
+        %144 = OpFAdd %v2float %143 %142
+        %145 = OpCompositeExtract %float %144 0
+        %146 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_1
+               OpStore %146 %145
+        %147 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %148 = OpLoad %v2float %147
+        %149 = OpCompositeConstruct %v2float %float_1 %float_1
+        %150 = OpFAdd %v2float %149 %148
+        %151 = OpCompositeExtract %float %150 0
+        %152 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_2
+               OpStore %152 %151
+        %153 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %154 = OpLoad %v2float %153
+        %155 = OpCompositeConstruct %v2float %float_1 %float_1
+        %156 = OpFAdd %v2float %155 %154
+        %157 = OpCompositeExtract %float %156 0
+        %158 = OpAccessChain %_ptr_Function_float %output_0 %int_1
+               OpStore %158 %157
+        %159 = OpLoad %HSConstantOut %output_0
+               OpReturnValue %159
+               OpFunctionEnd
--- a/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
+++ b/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@ -0,0 +1,102 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 46
+; Schema: 0
+               OpCapability Tessellation
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %main "main" %gl_TessLevelInner %gl_TessLevelOuter
+               OpExecutionMode %main OutputVertices 1
+               OpExecutionMode %main Triangles
+               OpSource ESSL 310
+               OpSourceExtension "GL_EXT_shader_io_blocks"
+               OpSourceExtension "GL_EXT_tessellation_shader"
+               OpName %main "main"
+               OpName %gl_TessLevelInner "gl_TessLevelInner"
+               OpName %TessLevels "TessLevels"
+               OpMemberName %TessLevels 0 "inner0"
+               OpMemberName %TessLevels 1 "inner1"
+               OpMemberName %TessLevels 2 "outer0"
+               OpMemberName %TessLevels 3 "outer1"
+               OpMemberName %TessLevels 4 "outer2"
+               OpMemberName %TessLevels 5 "outer3"
+               OpName %sb_levels "sb_levels"
+               OpName %gl_TessLevelOuter "gl_TessLevelOuter"
+               OpDecorate %gl_TessLevelInner Patch
+               OpDecorate %gl_TessLevelInner BuiltIn TessLevelInner
+               OpMemberDecorate %TessLevels 0 Restrict
+               OpMemberDecorate %TessLevels 0 NonWritable
+               OpMemberDecorate %TessLevels 0 Offset 0
+               OpMemberDecorate %TessLevels 1 Restrict
+               OpMemberDecorate %TessLevels 1 NonWritable
+               OpMemberDecorate %TessLevels 1 Offset 4
+               OpMemberDecorate %TessLevels 2 Restrict
+               OpMemberDecorate %TessLevels 2 NonWritable
+               OpMemberDecorate %TessLevels 2 Offset 8
+               OpMemberDecorate %TessLevels 3 Restrict
+               OpMemberDecorate %TessLevels 3 NonWritable
+               OpMemberDecorate %TessLevels 3 Offset 12
+               OpMemberDecorate %TessLevels 4 Restrict
+               OpMemberDecorate %TessLevels 4 NonWritable
+               OpMemberDecorate %TessLevels 4 Offset 16
+               OpMemberDecorate %TessLevels 5 Restrict
+               OpMemberDecorate %TessLevels 5 NonWritable
+               OpMemberDecorate %TessLevels 5 Offset 20
+               OpDecorate %TessLevels Block
+               OpDecorate %sb_levels DescriptorSet 0
+               OpDecorate %sb_levels Binding 0
+               OpDecorate %gl_TessLevelOuter Patch
+               OpDecorate %gl_TessLevelOuter BuiltIn TessLevelOuter
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+     %uint_2 = OpConstant %uint 2
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%gl_TessLevelInner = OpVariable %_ptr_Output__arr_float_uint_2 Output
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+ %TessLevels = OpTypeStruct %float %float %float %float %float %float
+%_ptr_StorageBuffer_TessLevels = OpTypePointer StorageBuffer %TessLevels
+  %sb_levels = OpVariable %_ptr_StorageBuffer_TessLevels StorageBuffer
+%_ptr_StorageBuffer_float = OpTypePointer StorageBuffer %float
+%_ptr_Output_float = OpTypePointer Output %float
+      %int_1 = OpConstant %int 1
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%gl_TessLevelOuter = OpVariable %_ptr_Output__arr_float_uint_4 Output
+      %int_2 = OpConstant %int 2
+      %int_3 = OpConstant %int 3
+      %int_4 = OpConstant %int 4
+      %int_5 = OpConstant %int 5
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %18 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_0
+         %19 = OpLoad %float %18
+         %21 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_0
+               OpStore %21 %19
+         %23 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_1
+         %24 = OpLoad %float %23
+         %25 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_1
+               OpStore %25 %24
+         %31 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_2
+         %32 = OpLoad %float %31
+         %33 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_0
+               OpStore %33 %32
+         %35 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_3
+         %36 = OpLoad %float %35
+         %37 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_1
+               OpStore %37 %36
+         %39 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_4
+         %40 = OpLoad %float %39
+         %41 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_2
+               OpStore %41 %40
+         %43 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_5
+         %44 = OpLoad %float %43
+         %45 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_3
+               OpStore %45 %44
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
+++ b/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@ -0,0 +1,32 @@
+#version 450
+layout(vertices = 1) out;
+
+in gl_PerVertex
+{
+   vec4 gl_Position;
+} gl_in[gl_MaxPatchVertices];
+
+out gl_PerVertex
+{
+   vec4 gl_Position;
+} gl_out[1];
+
+layout(location = 0) patch out vec3 vFoo;
+
+void set_position()
+{
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
+void main()
+{
+    gl_TessLevelInner[0] = 8.9;
+    gl_TessLevelInner[1] = 6.9;
+    gl_TessLevelOuter[0] = 8.9;
+    gl_TessLevelOuter[1] = 6.9;
+    gl_TessLevelOuter[2] = 3.9;
+    gl_TessLevelOuter[3] = 4.9;
+    vFoo = vec3(1.0);
+
+    set_position();
+}
--- a/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -0,0 +1,22 @@
+#version 450
+
+struct Boo
+{
+        vec3 a;
+        uvec3 b;
+};
+
+layout(vertices = 4) out;
+layout(location = 0) out Boo vVertex[];
+layout(location = 0) in Boo vInput[];
+
+void main()
+{
+        vVertex[gl_InvocationID] = vInput[gl_InvocationID];
+        gl_TessLevelOuter[0] = 1.0;
+        gl_TessLevelOuter[1] = 2.0;
+        gl_TessLevelOuter[2] = 3.0;
+        gl_TessLevelOuter[3] = 4.0;
+        gl_TessLevelInner[0] = 1.0;
+        gl_TessLevelInner[1] = 2.0;
+}
--- a/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
+++ b/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@ -0,0 +1,11 @@
+#version 460
+
+out gl_PerVertex
+{
+    vec4 gl_Position;
+};
+
+void main()
+{
+    gl_Position = vec4(gl_BaseVertex, gl_BaseInstance, 0, 1);
+}
--- a/shaders-msl/tesc/basic.multi-patch.tesc
+++ b/shaders-msl/tesc/basic.multi-patch.tesc
@ -0,0 +1,17 @@
+#version 310 es
+#extension GL_EXT_tessellation_shader : require
+
+layout(location = 0) patch out vec3 vFoo;
+
+layout(vertices = 1) out;
+
+void main()
+{
+    gl_TessLevelInner[0] = 8.9;
+    gl_TessLevelInner[1] = 6.9;
+    gl_TessLevelOuter[0] = 8.9;
+    gl_TessLevelOuter[1] = 6.9;
+    gl_TessLevelOuter[2] = 3.9;
+    gl_TessLevelOuter[3] = 4.9;
+    vFoo = vec3(1.0);
+}
--- a/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
+++ b/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@ -0,0 +1,12 @@
+#version 450
+
+layout(vertices = 4) out;
+
+layout(location = 0) in mat4 vInputs[gl_MaxPatchVertices];
+layout(location = 0) out mat4 vOutputs[4];
+
+void main()
+{
+	mat4 tmp[gl_MaxPatchVertices] = vInputs;
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID];
+}
--- a/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -0,0 +1,21 @@
+#version 450
+
+layout(vertices = 4) out;
+
+struct VertexData
+{
+	mat4 a;
+	vec4 b[2];
+	vec4 c;
+};
+
+layout(location = 0) in VertexData vInputs[gl_MaxPatchVertices];
+layout(location = 0) out vec4 vOutputs[4];
+
+void main()
+{
+	VertexData tmp[gl_MaxPatchVertices] = vInputs;
+	VertexData tmp_single = vInputs[gl_InvocationID ^ 1];
+
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1] + tmp[gl_InvocationID].c + tmp_single.c;
+}
--- a/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
+++ b/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@ -0,0 +1,12 @@
+#version 450
+
+layout(vertices = 4) out;
+
+layout(location = 0) in vec4 vInputs[gl_MaxPatchVertices];
+layout(location = 0) out vec4 vOutputs[4];
+
+void main()
+{
+	vec4 tmp[gl_MaxPatchVertices] = vInputs;
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID];
+}
--- a/shaders-msl/tesc/water_tess.multi-patch.tesc
+++ b/shaders-msl/tesc/water_tess.multi-patch.tesc
@ -0,0 +1,115 @@
+#version 310 es
+#extension GL_EXT_tessellation_shader : require
+
+layout(vertices = 1) out;
+layout(location = 0) in vec2 vPatchPosBase[];
+
+layout(std140) uniform UBO
+{
+    vec4 uScale;
+    highp vec3 uCamPos;
+    vec2 uPatchSize;
+    vec2 uMaxTessLevel;
+    float uDistanceMod;
+    vec4 uFrustum[6];
+};
+
+layout(location = 1) patch out vec2 vOutPatchPosBase;
+layout(location = 2) patch out vec4 vPatchLods;
+
+float lod_factor(vec2 pos_)
+{
+    vec2 pos = pos_ * uScale.xy;
+    vec3 dist_to_cam = uCamPos - vec3(pos.x, 0.0, pos.y);
+    float level = log2((length(dist_to_cam) + 0.0001) * uDistanceMod);
+    return clamp(level, 0.0, uMaxTessLevel.x);
+}
+
+float tess_level(float lod)
+{
+    return uMaxTessLevel.y * exp2(-lod);
+}
+
+vec4 tess_level(vec4 lod)
+{
+    return uMaxTessLevel.y * exp2(-lod);
+}
+
+// Guard band for vertex displacement.
+#define GUARD_BAND 10.0
+bool frustum_cull(vec2 p0)
+{
+    vec2 min_xz = (p0 - GUARD_BAND) * uScale.xy;
+    vec2 max_xz = (p0 + uPatchSize + GUARD_BAND) * uScale.xy;
+
+    vec3 bb_min = vec3(min_xz.x, -GUARD_BAND, min_xz.y);
+    vec3 bb_max = vec3(max_xz.x, +GUARD_BAND, max_xz.y);
+    vec3 center = 0.5 * (bb_min + bb_max);
+    float radius = 0.5 * length(bb_max - bb_min);
+
+    vec3 f0 = vec3(
+        dot(uFrustum[0], vec4(center, 1.0)),
+        dot(uFrustum[1], vec4(center, 1.0)),
+        dot(uFrustum[2], vec4(center, 1.0)));
+
+    vec3 f1 = vec3(
+        dot(uFrustum[3], vec4(center, 1.0)),
+        dot(uFrustum[4], vec4(center, 1.0)),
+        dot(uFrustum[5], vec4(center, 1.0)));
+
+    return !(any(lessThanEqual(f0, vec3(-radius))) || any(lessThanEqual(f1, vec3(-radius))));
+}
+
+void compute_tess_levels(vec2 p0)
+{
+    vOutPatchPosBase = p0;
+
+    float l00 = lod_factor(p0 + vec2(-0.5, -0.5) * uPatchSize);
+    float l10 = lod_factor(p0 + vec2(+0.5, -0.5) * uPatchSize);
+    float l20 = lod_factor(p0 + vec2(+1.5, -0.5) * uPatchSize);
+    float l01 = lod_factor(p0 + vec2(-0.5, +0.5) * uPatchSize);
+    float l11 = lod_factor(p0 + vec2(+0.5, +0.5) * uPatchSize);
+    float l21 = lod_factor(p0 + vec2(+1.5, +0.5) * uPatchSize);
+    float l02 = lod_factor(p0 + vec2(-0.5, +1.5) * uPatchSize);
+    float l12 = lod_factor(p0 + vec2(+0.5, +1.5) * uPatchSize);
+    float l22 = lod_factor(p0 + vec2(+1.5, +1.5) * uPatchSize);
+
+    vec4 lods = vec4(
+        dot(vec4(l01, l11, l02, l12), vec4(0.25)),
+        dot(vec4(l00, l10, l01, l11), vec4(0.25)),
+        dot(vec4(l10, l20, l11, l21), vec4(0.25)),
+        dot(vec4(l11, l21, l12, l22), vec4(0.25)));
+
+    vPatchLods = lods;
+
+    vec4 outer_lods = min(lods.xyzw, lods.yzwx);
+    vec4 levels = tess_level(outer_lods);
+    gl_TessLevelOuter[0] = levels.x;
+    gl_TessLevelOuter[1] = levels.y;
+    gl_TessLevelOuter[2] = levels.z;
+    gl_TessLevelOuter[3] = levels.w;
+
+    float min_lod = min(min(lods.x, lods.y), min(lods.z, lods.w));
+    float inner = tess_level(min(min_lod, l11));
+    gl_TessLevelInner[0] = inner;
+    gl_TessLevelInner[1] = inner;
+}
+
+void main()
+{
+    vec2 p0 = vPatchPosBase[0];
+    if (!frustum_cull(p0))
+    {
+        gl_TessLevelOuter[0] = -1.0;
+        gl_TessLevelOuter[1] = -1.0;
+        gl_TessLevelOuter[2] = -1.0;
+        gl_TessLevelOuter[3] = -1.0;
+        gl_TessLevelInner[0] = -1.0;
+        gl_TessLevelInner[1] = -1.0;
+    }
+    else
+    {
+        compute_tess_levels(p0);
+    }
+}
+
--- a/shaders-msl/vert/basic.for-tess.vert
+++ b/shaders-msl/vert/basic.for-tess.vert
@ -0,0 +1,17 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+    uniform mat4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+layout(location = 1) in vec3 aNormal;
+
+layout(location = 0) out vec3 vNormal;
+
+void main()
+{
+    gl_Position = uMVP * aVertex;
+    vNormal = aNormal;
+}
--- a/shaders-msl/vert/leaf-function.for-tess.vert
+++ b/shaders-msl/vert/leaf-function.for-tess.vert
@ -0,0 +1,22 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+    uniform mat4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+layout(location = 1) in vec3 aNormal;
+
+layout(location = 0) out vec3 vNormal;
+
+void set_output()
+{
+    gl_Position = uMVP * aVertex;
+    vNormal = aNormal;
+}
+
+void main()
+{
+    set_output();
+}
--- a/shaders-msl/vert/no_stage_out.for-tess.vert
+++ b/shaders-msl/vert/no_stage_out.for-tess.vert
@ -0,0 +1,14 @@
+#version 450
+
+layout(binding = 0, std430) writeonly buffer _10_12
+{
+	uvec4 _m0[1024];
+} _12;
+
+layout(location = 0) in uvec4 _19;
+
+void main()
+{
+	_12._m0[gl_VertexIndex] = _19;
+}
+
--- a/spirv_common.hpp
+++ b/spirv_common.hpp
@ -1589,8 +1589,10 @@ enum ExtendedDecorations
 	// Marks a buffer block for using explicit offsets (GLSL/HLSL).
 	SPIRVCrossDecorationExplicitOffset,

-	// Apply to a variable in the Input storage class; marks it as holding the base group passed to vkCmdDispatchBase().
-	// In MSL, this is used to adjust the WorkgroupId and GlobalInvocationId variables.
+	// Apply to a variable in the Input storage class; marks it as holding the base group passed to vkCmdDispatchBase(),
+	// or the base vertex and instance indices passed to vkCmdDrawIndexed().
+	// In MSL, this is used to adjust the WorkgroupId and GlobalInvocationId variables in compute shaders,
+	// and to hold the BaseVertex and BaseInstance variables in vertex shaders.
 	SPIRVCrossDecorationBuiltInDispatchBase,

 	// Apply to a variable that is a function parameter; marks it as being a "dynamic"
@ -1599,6 +1601,20 @@ enum ExtendedDecorations
 	// Y'CbCr conversion.
 	SPIRVCrossDecorationDynamicImageSampler,

+	// Apply to a variable in the Input storage class; marks it as holding the size of the stage
+	// input grid.
+	// In MSL, this is used to hold the vertex and instance counts in a tessellation pipeline
+	// vertex shader.
+	SPIRVCrossDecorationBuiltInStageInputSize,
+
+	// Apply to any access chain of a tessellation I/O variable; stores the type of the sub-object
+	// that was chained to, as recorded in the input variable itself. This is used in case the pointer
+	// is itself used as the base of an access chain, to calculate the original type of the sub-object
+	// chained to, in case a swizzle needs to be applied. This should not happen normally with valid
+	// SPIR-V, but the MSL backend can change the type of input variables, necessitating the
+	// addition of swizzles to keep the generated code compiling.
+	SPIRVCrossDecorationTessIOOriginalInputTypeID,
+
 	SPIRVCrossDecorationCount
 };

--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@ -635,6 +635,26 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_MSL_ENABLE_CLIP_DISTANCE_USER_VARYING:
 		options->msl.enable_clip_distance_user_varying = value != 0;
 		break;
+
+	case SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP:
+		options->msl.multi_patch_workgroup = value != 0;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX:
+		options->msl.shader_input_buffer_index = value;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX:
+		options->msl.shader_index_buffer_index = value;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION:
+		options->msl.vertex_for_tessellation = value != 0;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE:
+		options->msl.vertex_index_type = static_cast<CompilerMSL::Options::IndexType>(value);
+		break;
 #endif

 	default:
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@ -33,7 +33,7 @@ extern "C" {
 /* Bumped if ABI or API breaks backwards compatibility. */
 #define SPVC_C_API_VERSION_MAJOR 0
 /* Bumped if APIs or enumerations are added in a backwards compatible way. */
-#define SPVC_C_API_VERSION_MINOR 35
+#define SPVC_C_API_VERSION_MINOR 36
 /* Bumped if internal implementation details change. */
 #define SPVC_C_API_VERSION_PATCH 0

@ -258,12 +258,23 @@ typedef enum spvc_msl_platform
 	SPVC_MSL_PLATFORM_MAX_INT = 0x7fffffff
 } spvc_msl_platform;

+/* Maps to C++ API. */
+typedef enum spvc_msl_index_type
+{
+	SPVC_MSL_INDEX_TYPE_NONE = 0,
+	SPVC_MSL_INDEX_TYPE_UINT16 = 1,
+	SPVC_MSL_INDEX_TYPE_UINT32 = 2,
+	SPVC_MSL_INDEX_TYPE_MAX_INT = 0x7fffffff
+} spvc_msl_index_type;
+
 /* Maps to C++ API. */
 typedef enum spvc_msl_shader_input_format
 {
 	SPVC_MSL_SHADER_INPUT_FORMAT_OTHER = 0,
 	SPVC_MSL_SHADER_INPUT_FORMAT_UINT8 = 1,
 	SPVC_MSL_SHADER_INPUT_FORMAT_UINT16 = 2,
+	SPVC_MSL_SHADER_INPUT_FORMAT_ANY16 = 3,
+	SPVC_MSL_SHADER_INPUT_FORMAT_ANY32 = 4,

 	/* Deprecated names. */
 	SPVC_MSL_VERTEX_FORMAT_OTHER = SPVC_MSL_SHADER_INPUT_FORMAT_OTHER,
@ -617,6 +628,12 @@ typedef enum spvc_compiler_option

 	SPVC_COMPILER_OPTION_HLSL_ENABLE_16BIT_TYPES = 60 | SPVC_COMPILER_OPTION_HLSL_BIT,

+	SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP = 61 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX = 62 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX = 63 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION = 64 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE = 65 | SPVC_COMPILER_OPTION_MSL_BIT,
+
 	SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 } spvc_compiler_option;

--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@ -35,6 +35,8 @@ enum MSLShaderInputFormat
 	MSL_SHADER_INPUT_FORMAT_OTHER = 0,
 	MSL_SHADER_INPUT_FORMAT_UINT8 = 1,
 	MSL_SHADER_INPUT_FORMAT_UINT16 = 2,
+	MSL_SHADER_INPUT_FORMAT_ANY16 = 3,
+	MSL_SHADER_INPUT_FORMAT_ANY32 = 4,

 	// Deprecated aliases.
 	MSL_VERTEX_FORMAT_OTHER = MSL_SHADER_INPUT_FORMAT_OTHER,
@ -271,6 +273,8 @@ public:
 		uint32_t buffer_size_buffer_index = 25;
 		uint32_t view_mask_buffer_index = 24;
 		uint32_t dynamic_offsets_buffer_index = 23;
+		uint32_t shader_input_buffer_index = 22;
+		uint32_t shader_index_buffer_index = 21;
 		uint32_t shader_input_wg_index = 0;
 		uint32_t device_index = 0;
 		uint32_t enable_frag_output_mask = 0xffffffff;
@ -329,6 +333,31 @@ public:
 		// can be read in subsequent stages.
 		bool enable_clip_distance_user_varying = true;

+		// In a tessellation control shader, assume that more than one patch can be processed in a
+		// single workgroup. This requires changes to the way the InvocationId and PrimitiveId
+		// builtins are processed, but should result in more efficient usage of the GPU.
+		bool multi_patch_workgroup = false;
+
+		// If set, a vertex shader will be compiled as part of a tessellation pipeline.
+		// It will be translated as a compute kernel, so it can use the global invocation ID
+		// to index the output buffer.
+		bool vertex_for_tessellation = false;
+
+		enum class IndexType
+		{
+			None = 0,
+			UInt16 = 1,
+			UInt32 = 2
+		};
+
+		// The type of index in the index buffer, if present. For a compute shader, Metal
+		// requires specifying the indexing at pipeline creation, rather than at draw time
+		// as with graphics pipelines. This means we must create three different pipelines,
+		// for no indexing, 16-bit indices, and 32-bit indices. Each requires different
+		// handling for the gl_VertexIndex builtin. We may as well, then, create three
+		// different shaders for these three scenarios.
+		IndexType vertex_index_type = IndexType::None;
+
 		bool is_ios() const
 		{
 			return platform == iOS;
@ -431,7 +460,7 @@ public:
 	// input is a shader input description used to fix up shader input variables.
 	// If shader inputs are provided, is_msl_shader_input_used() will return true after
 	// calling ::compile() if the location was used by the MSL code.
-	void add_msl_shader_input(const MSLShaderInput &attr);
+	void add_msl_shader_input(const MSLShaderInput &input);

 	// resource is a resource binding to indicate the MSL buffer,
 	// texture or sampler index to use for a particular SPIR-V description set
@ -692,7 +721,7 @@ protected:

 	void fix_up_interface_member_indices(spv::StorageClass storage, uint32_t ib_type_id);

-	void mark_location_as_used_by_shader(uint32_t location, spv::StorageClass storage);
+	void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, spv::StorageClass storage);
 	uint32_t ensure_correct_builtin_type(uint32_t type_id, spv::BuiltIn builtin);
 	uint32_t ensure_correct_input_type(uint32_t type_id, uint32_t location, uint32_t num_components = 0);

@ -739,7 +768,13 @@ protected:
 	uint32_t get_declared_struct_member_matrix_stride_msl(const SPIRType &struct_type, uint32_t index) const;
 	uint32_t get_declared_struct_member_alignment_msl(const SPIRType &struct_type, uint32_t index) const;

+	uint32_t get_declared_input_size_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_array_stride_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_matrix_stride_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_alignment_msl(const SPIRType &struct_type, uint32_t index) const;
+
 	const SPIRType &get_physical_member_type(const SPIRType &struct_type, uint32_t index) const;
+	SPIRType get_presumed_input_type(const SPIRType &struct_type, uint32_t index) const;

 	uint32_t get_declared_struct_size_msl(const SPIRType &struct_type, bool ignore_alignment = false,
 	                                      bool ignore_padding = false) const;
@ -785,6 +820,7 @@ protected:
 	uint32_t builtin_subgroup_invocation_id_id = 0;
 	uint32_t builtin_subgroup_size_id = 0;
 	uint32_t builtin_dispatch_base_id = 0;
+	uint32_t builtin_stage_input_size_id = 0;
 	uint32_t swizzle_buffer_id = 0;
 	uint32_t buffer_size_buffer_id = 0;
 	uint32_t view_mask_buffer_id = 0;
@ -811,7 +847,8 @@ protected:

 	Options msl_options;
 	std::set<SPVFuncImpl> spv_function_implementations;
-	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_location;
+	// Must be ordered to ensure declarations are in a specific order.
+	std::map<uint32_t, MSLShaderInput> inputs_by_location;
 	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_builtin;
 	std::unordered_set<uint32_t> inputs_in_use;
 	std::unordered_map<uint32_t, uint32_t> fragment_output_components;
@ -866,9 +903,11 @@ protected:
 	std::string buffer_size_name_suffix = "BufferSize";
 	std::string plane_name_suffix = "Plane";
 	std::string input_wg_var_name = "gl_in";
+	std::string input_buffer_var_name = "spvIn";
 	std::string output_buffer_var_name = "spvOut";
 	std::string patch_output_buffer_var_name = "spvPatchOut";
 	std::string tess_factor_buffer_var_name = "spvTessLevel";
+	std::string index_buffer_var_name = "spvIndices";
 	spv::Op previous_instruction_opcode = spv::OpNop;

 	// Must be ordered since declaration is in a specific order.
--- a/test_shaders.py
+++ b/test_shaders.py
@ -280,6 +280,19 @@ def cross_compile_msl(shader, spirv, opt, iterations, paths):
        msl_args.append('6')
        msl_args.append('other')
        msl_args.append('4')
+    if '.multi-patch.' in shader:
+        msl_args.append('--msl-multi-patch-workgroup')
+        # Arbitrary for testing purposes.
+        msl_args.append('--msl-shader-input')
+        msl_args.append('0')
+        msl_args.append('any32')
+        msl_args.append('3')
+        msl_args.append('--msl-shader-input')
+        msl_args.append('1')
+        msl_args.append('any16')
+        msl_args.append('2')
+    if '.for-tess.' in shader:
+        msl_args.append('--msl-vertex-for-tessellation')

    subprocess.check_call(msl_args)