Merge pull request #1294 from cdavis5e/msl-multi-patch-workgroup

MSL: Add support for processing more than one patch per workgroup.
2020-07-24 10:36:05 +02:00 · 2020-07-24 10:36:05 +02:00 · 934825a6a2
commit 934825a6a2
parent 3dcc23a5b3 688c5fcbda
46 changed files with 2649 additions and 113 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -323,7 +323,7 @@ if (SPIRV_CROSS_STATIC)
 endif()

 set(spirv-cross-abi-major 0)
-set(spirv-cross-abi-minor 35)
+set(spirv-cross-abi-minor 36)
 set(spirv-cross-abi-patch 0)

 if (SPIRV_CROSS_SHARED)
--- a/main.cpp
+++ b/main.cpp
@ -558,6 +558,8 @@ struct CLIArguments
 	bool msl_enable_frag_stencil_ref_builtin = true;
 	uint32_t msl_enable_frag_output_mask = 0xffffffff;
 	bool msl_enable_clip_distance_user_varying = true;
+	bool msl_multi_patch_workgroup = false;
+	bool msl_vertex_for_tessellation = false;
 	bool glsl_emit_push_constant_as_ubo = false;
 	bool glsl_emit_ubo_as_plain_uniforms = false;
 	SmallVector<pair<uint32_t, uint32_t>> glsl_ext_framebuffer_fetch;
@ -747,9 +749,15 @@ static void print_help_msl()
 	                "\t[--msl-enable-frag-output-mask <mask>]:\n\t\tOnly selectively enable fragment outputs. Useful if pipeline does not enable fragment output for certain locations, as pipeline creation might otherwise fail.\n"
 	                "\t[--msl-no-clip-distance-user-varying]:\n\t\tDo not emit user varyings to emulate gl_ClipDistance in fragment shaders.\n"
 	                "\t[--msl-shader-input <index> <format> <size>]:\n\t\tSpecify the format of the shader input at <index>.\n"
-	                "\t\t<format> can be 'u16', 'u8', or 'other', to indicate a 16-bit unsigned integer, 8-bit unsigned integer, "
+	                "\t\t<format> can be 'any32', 'any16', 'u16', 'u8', or 'other', to indicate a 32-bit opaque value, 16-bit opaque value, 16-bit unsigned integer, 8-bit unsigned integer, "
 	                "or other-typed variable. <size> is the vector length of the variable, which must be greater than or equal to that declared in the shader.\n"
-	                "\t\tUseful if shader stage interfaces don't match up, as pipeline creation might otherwise fail.\n");
+	                "\t\tUseful if shader stage interfaces don't match up, as pipeline creation might otherwise fail.\n"
+	                "\t[--msl-multi-patch-workgroup]:\n\t\tUse the new style of tessellation control processing, where multiple patches are processed per workgroup.\n"
+					"\t\tThis should increase throughput by ensuring all the GPU's SIMD lanes are occupied, but it is not compatible with the old style.\n"
+					"\t\tIn addition, this style also passes input variables in buffers directly instead of using vertex attribute processing.\n"
+					"\t\tIn a future version of SPIRV-Cross, this will become the default.\n"
+	                "\t[--msl-vertex-for-tessellation]:\n\t\tWhen handling a vertex shader, marks it as one that will be used with a new-style tessellation control shader.\n"
+					"\t\tThe vertex shader is output to MSL as a compute kernel which outputs vertices to the buffer in the order they are received, rather than in index order as with --msl-capture-output normally.\n");
 	// clang-format on
 }

@ -983,6 +991,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		msl_opts.enable_frag_stencil_ref_builtin = args.msl_enable_frag_stencil_ref_builtin;
 		msl_opts.enable_frag_output_mask = args.msl_enable_frag_output_mask;
 		msl_opts.enable_clip_distance_user_varying = args.msl_enable_clip_distance_user_varying;
+		msl_opts.multi_patch_workgroup = args.msl_multi_patch_workgroup;
+		msl_opts.vertex_for_tessellation = args.msl_vertex_for_tessellation;
 		msl_comp->set_msl_options(msl_opts);
 		for (auto &v : args.msl_discrete_descriptor_sets)
 			msl_comp->add_discrete_descriptor_set(v);
@ -1381,15 +1391,21 @@ static int main_inner(int argc, char *argv[])
 		// Make sure next_uint() is called in-order.
 		input.location = parser.next_uint();
 		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "u16") == 0)
-			input.format = MSL_VERTEX_FORMAT_UINT16;
+		if (strcmp(format, "any32") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_ANY32;
+		else if (strcmp(format, "any16") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_ANY16;
+		else if (strcmp(format, "u16") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_UINT16;
 		else if (strcmp(format, "u8") == 0)
-			input.format = MSL_VERTEX_FORMAT_UINT8;
+			input.format = MSL_SHADER_INPUT_FORMAT_UINT8;
 		else
-			input.format = MSL_VERTEX_FORMAT_OTHER;
+			input.format = MSL_SHADER_INPUT_FORMAT_OTHER;
 		input.vecsize = parser.next_uint();
 		args.msl_shader_inputs.push_back(input);
 	});
+	cbs.add("--msl-multi-patch-workgroup", [&args](CLIParser &) { args.msl_multi_patch_workgroup = true; });
+	cbs.add("--msl-vertex-for-tessellation", [&args](CLIParser &) { args.msl_vertex_for_tessellation = true; });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point", [&args](CLIParser &parser) {
 		auto old_name = parser.next_string();
--- a/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
+++ b/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TessLevels
+{
+    float inner0;
+    float inner1;
+    float outer0;
+    float outer1;
+    float outer2;
+    float outer3;
+};
+
+kernel void main0(const device TessLevels& sb_levels [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(sb_levels.inner0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(sb_levels.outer0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(sb_levels.outer1);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(sb_levels.outer2);
+}
+
--- a/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
+++ b/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@ -0,0 +1,39 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+struct main0_in
+{
+    uint3 m_86;
+    ushort2 m_90;
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 1];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 1;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
--- a/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -0,0 +1,38 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Boo
+{
+    float3 a;
+    uint3 b;
+};
+
+struct main0_out
+{
+    Boo vVertex;
+};
+
+struct main0_in
+{
+    float3 Boo_a;
+    uint3 Boo_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
+    gl_out[gl_InvocationID].vVertex = _26;
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+}
+
--- a/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
+++ b/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_BaseVertex = spvDispatchBase.x;
+    uint gl_BaseInstance = spvDispatchBase.y;
+    out.gl_Position = float4(float(int(gl_BaseVertex)), float(int(gl_BaseInstance)), 0.0, 1.0);
+}
+
--- a/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4x4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 vInputs;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4x4, 32> _16 = spvUnsafeArray<float4x4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4x4, 32> tmp;
+    tmp = _16;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -0,0 +1,78 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexData
+{
+    float4x4 a;
+    spvUnsafeArray<float4, 2> b;
+    float4 c;
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 VertexData_a;
+    spvUnsafeArray<float4, 2> VertexData_b;
+    float4 VertexData_c;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> tmp;
+    tmp = _19;
+    int _27 = gl_InvocationID ^ 1;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + gl_in[_27].VertexData_c;
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@ -0,0 +1,69 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4 vInputs;
+    ushort2 m_43;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4, 32> _15 = spvUnsafeArray<float4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4, 32> tmp;
+    tmp = _15;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
@ -0,0 +1,91 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4 uScale;
+    float3 uCamPos;
+    float2 uPatchSize;
+    float2 uMaxTessLevel;
+    float uDistanceMod;
+    float4 uFrustum[6];
+};
+
+struct main0_patchOut
+{
+    float2 vOutPatchPosBase;
+    float4 vPatchLods;
+};
+
+struct main0_in
+{
+    float3 vPatchPosBase;
+    ushort2 m_996;
+};
+
+kernel void main0(constant UBO& _41 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    float2 _431 = (gl_in[0].vPatchPosBase.xy - float2(10.0)) * _41.uScale.xy;
+    float2 _441 = ((gl_in[0].vPatchPosBase.xy + _41.uPatchSize) + float2(10.0)) * _41.uScale.xy;
+    float3 _446 = float3(_431.x, -10.0, _431.y);
+    float3 _451 = float3(_441.x, 10.0, _441.y);
+    float4 _467 = float4((_446 + _451) * 0.5, 1.0);
+    float3 _514 = float3(length(_451 - _446) * (-0.5));
+    bool _516 = any(float3(dot(_41.uFrustum[0], _467), dot(_41.uFrustum[1], _467), dot(_41.uFrustum[2], _467)) <= _514);
+    bool _526;
+    if (!_516)
+    {
+        _526 = any(float3(dot(_41.uFrustum[3], _467), dot(_41.uFrustum[4], _467), dot(_41.uFrustum[5], _467)) <= _514);
+    }
+    else
+    {
+        _526 = _516;
+    }
+    if (!(!_526))
+    {
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(-1.0);
+    }
+    else
+    {
+        patchOut.vOutPatchPosBase = gl_in[0].vPatchPosBase.xy;
+        float2 _681 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _710 = (gl_in[0].vPatchPosBase.xy + (float2(0.5, -0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _729 = fast::clamp(log2((length(_41.uCamPos - float3(_710.x, 0.0, _710.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _739 = (gl_in[0].vPatchPosBase.xy + (float2(1.5, -0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _768 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5, 0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _787 = fast::clamp(log2((length(_41.uCamPos - float3(_768.x, 0.0, _768.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _797 = (gl_in[0].vPatchPosBase.xy + (float2(0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _816 = fast::clamp(log2((length(_41.uCamPos - float3(_797.x, 0.0, _797.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _826 = (gl_in[0].vPatchPosBase.xy + (float2(1.5, 0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _845 = fast::clamp(log2((length(_41.uCamPos - float3(_826.x, 0.0, _826.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _855 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5, 1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _884 = (gl_in[0].vPatchPosBase.xy + (float2(0.5, 1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _903 = fast::clamp(log2((length(_41.uCamPos - float3(_884.x, 0.0, _884.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _913 = (gl_in[0].vPatchPosBase.xy + (float2(1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _614 = dot(float4(_787, _816, fast::clamp(log2((length(_41.uCamPos - float3(_855.x, 0.0, _855.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _903), float4(0.25));
+        float _620 = dot(float4(fast::clamp(log2((length(_41.uCamPos - float3(_681.x, 0.0, _681.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _729, _787, _816), float4(0.25));
+        float _626 = dot(float4(_729, fast::clamp(log2((length(_41.uCamPos - float3(_739.x, 0.0, _739.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _816, _845), float4(0.25));
+        float _632 = dot(float4(_816, _845, _903, fast::clamp(log2((length(_41.uCamPos - float3(_913.x, 0.0, _913.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x)), float4(0.25));
+        float4 _633 = float4(_614, _620, _626, _632);
+        patchOut.vPatchLods = _633;
+        float4 _940 = exp2(-fast::min(_633, _633.yzwx)) * _41.uMaxTessLevel.y;
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_940.x);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_940.y);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(_940.z);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(_940.w);
+        float _948 = _41.uMaxTessLevel.y * exp2(-fast::min(fast::min(fast::min(_614, _620), fast::min(_626, _632)), _816));
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(_948);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(_948);
+    }
+}
+
--- a/reference/opt/shaders-msl/vert/basic.for-tess.vert
+++ b/reference/opt/shaders-msl/vert/basic.for-tess.vert
@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _16.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
--- a/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
+++ b/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _18.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
--- a/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
+++ b/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _10
+{
+    uint4 _m0[1024];
+};
+
+struct main0_in
+{
+    uint4 m_19 [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], device _10& _12 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]])
+{
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_VertexIndex = gl_GlobalInvocationID.x + spvDispatchBase.x;
+    _12._m0[int(gl_VertexIndex)] = in.m_19;
+}
+
--- a/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@ -0,0 +1,140 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexOutput
+{
+    float4 pos;
+    float2 uv;
+};
+
+struct HSOut
+{
+    float4 pos;
+    float2 uv;
+};
+
+struct HSConstantOut
+{
+    spvUnsafeArray<float, 3> EdgeTess;
+    float InsideTess;
+};
+
+struct VertexOutput_1
+{
+    float3 uv;
+};
+
+struct HSOut_1
+{
+    float2 uv;
+};
+
+struct main0_out
+{
+    HSOut_1 _entryPointOutput;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float3 VertexOutput_uv;
+    ushort2 m_172;
+    float4 gl_Position;
+};
+
+static inline __attribute__((always_inline))
+HSOut _hs_main(thread const spvUnsafeArray<VertexOutput, 3> (&p), thread const uint& i)
+{
+    HSOut _output;
+    _output.pos = p[i].pos;
+    _output.uv = p[i].uv;
+    return _output;
+}
+
+static inline __attribute__((always_inline))
+HSConstantOut PatchHS(thread const spvUnsafeArray<VertexOutput, 3> (&_patch))
+{
+    HSConstantOut _output;
+    _output.EdgeTess[0] = (float2(1.0) + _patch[0].uv).x;
+    _output.EdgeTess[1] = (float2(1.0) + _patch[0].uv).x;
+    _output.EdgeTess[2] = (float2(1.0) + _patch[0].uv).x;
+    _output.InsideTess = (float2(1.0) + _patch[0].uv).x;
+    return _output;
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 3];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 3;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1]);
+    spvUnsafeArray<VertexOutput, 3> p;
+    p[0].pos = gl_in[0].gl_Position;
+    p[0].uv = gl_in[0].VertexOutput_uv.xy;
+    p[1].pos = gl_in[1].gl_Position;
+    p[1].uv = gl_in[1].VertexOutput_uv.xy;
+    p[2].pos = gl_in[2].gl_Position;
+    p[2].uv = gl_in[2].VertexOutput_uv.xy;
+    uint i = gl_InvocationID;
+    spvUnsafeArray<VertexOutput, 3> param;
+    param = p;
+    uint param_1 = i;
+    HSOut flattenTemp = _hs_main(param, param_1);
+    gl_out[gl_InvocationID].gl_Position = flattenTemp.pos;
+    gl_out[gl_InvocationID]._entryPointOutput.uv = flattenTemp.uv;
+    threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup);
+    if (int(gl_InvocationID) == 0)
+    {
+        spvUnsafeArray<VertexOutput, 3> param_2;
+        param_2 = p;
+        HSConstantOut _patchConstantResult = PatchHS(param_2);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_patchConstantResult.EdgeTess[0]);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_patchConstantResult.EdgeTess[1]);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(_patchConstantResult.EdgeTess[2]);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(_patchConstantResult.InsideTess);
+    }
+}
+
--- a/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
+++ b/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TessLevels
+{
+    float inner0;
+    float inner1;
+    float outer0;
+    float outer1;
+    float outer2;
+    float outer3;
+};
+
+kernel void main0(const device TessLevels& sb_levels [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(sb_levels.inner0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(sb_levels.outer0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(sb_levels.outer1);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(sb_levels.outer2);
+}
+
--- a/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
+++ b/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@ -0,0 +1,47 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+struct main0_in
+{
+    uint3 m_78;
+    ushort2 m_82;
+    float4 gl_Position;
+};
+
+static inline __attribute__((always_inline))
+void set_position(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device main0_in* thread & gl_in)
+{
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 1];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 1;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+    set_position(gl_out, gl_InvocationID, gl_in);
+}
+
--- a/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -0,0 +1,38 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Boo
+{
+    float3 a;
+    uint3 b;
+};
+
+struct main0_out
+{
+    Boo vVertex;
+};
+
+struct main0_in
+{
+    float3 Boo_a;
+    uint3 Boo_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
+    gl_out[gl_InvocationID].vVertex = _26;
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+}
+
--- a/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
+++ b/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_BaseVertex = spvDispatchBase.x;
+    uint gl_BaseInstance = spvDispatchBase.y;
+    out.gl_Position = float4(float(int(gl_BaseVertex)), float(int(gl_BaseInstance)), 0.0, 1.0);
+}
+
--- a/reference/shaders-msl/tesc/basic.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/basic.multi-patch.tesc
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4x4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 vInputs;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4x4, 32> _16 = spvUnsafeArray<float4x4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4x4, 32> tmp;
+    tmp = _16;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -0,0 +1,80 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexData
+{
+    float4x4 a;
+    spvUnsafeArray<float4, 2> b;
+    float4 c;
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 VertexData_a;
+    spvUnsafeArray<float4, 2> VertexData_b;
+    float4 VertexData_c;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> tmp;
+    tmp = _19;
+    int _27 = gl_InvocationID ^ 1;
+    VertexData _30 = VertexData{ gl_in[_27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[_27].VertexData_b[0], gl_in[_27].VertexData_b[1] }), gl_in[_27].VertexData_c };
+    VertexData tmp_single = _30;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + tmp_single.c;
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@ -0,0 +1,69 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4 vInputs;
+    ushort2 m_43;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4, 32> _15 = spvUnsafeArray<float4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4, 32> tmp;
+    tmp = _15;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
--- a/reference/shaders-msl/tesc/water_tess.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/water_tess.multi-patch.tesc
@ -0,0 +1,135 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4 uScale;
+    float3 uCamPos;
+    float2 uPatchSize;
+    float2 uMaxTessLevel;
+    float uDistanceMod;
+    float4 uFrustum[6];
+};
+
+struct main0_patchOut
+{
+    float2 vOutPatchPosBase;
+    float4 vPatchLods;
+};
+
+struct main0_in
+{
+    float3 vPatchPosBase;
+    ushort2 m_430;
+};
+
+static inline __attribute__((always_inline))
+bool frustum_cull(thread const float2& p0, constant UBO& v_41)
+{
+    float2 min_xz = (p0 - float2(10.0)) * v_41.uScale.xy;
+    float2 max_xz = ((p0 + v_41.uPatchSize) + float2(10.0)) * v_41.uScale.xy;
+    float3 bb_min = float3(min_xz.x, -10.0, min_xz.y);
+    float3 bb_max = float3(max_xz.x, 10.0, max_xz.y);
+    float3 center = (bb_min + bb_max) * 0.5;
+    float radius = 0.5 * length(bb_max - bb_min);
+    float3 f0 = float3(dot(v_41.uFrustum[0], float4(center, 1.0)), dot(v_41.uFrustum[1], float4(center, 1.0)), dot(v_41.uFrustum[2], float4(center, 1.0)));
+    float3 f1 = float3(dot(v_41.uFrustum[3], float4(center, 1.0)), dot(v_41.uFrustum[4], float4(center, 1.0)), dot(v_41.uFrustum[5], float4(center, 1.0)));
+    bool _205 = any(f0 <= float3(-radius));
+    bool _215;
+    if (!_205)
+    {
+        _215 = any(f1 <= float3(-radius));
+    }
+    else
+    {
+        _215 = _205;
+    }
+    return !_215;
+}
+
+static inline __attribute__((always_inline))
+float lod_factor(thread const float2& pos_, constant UBO& v_41)
+{
+    float2 pos = pos_ * v_41.uScale.xy;
+    float3 dist_to_cam = v_41.uCamPos - float3(pos.x, 0.0, pos.y);
+    float level = log2((length(dist_to_cam) + 9.9999997473787516355514526367188e-05) * v_41.uDistanceMod);
+    return fast::clamp(level, 0.0, v_41.uMaxTessLevel.x);
+}
+
+static inline __attribute__((always_inline))
+float4 tess_level(thread const float4& lod, constant UBO& v_41)
+{
+    return exp2(-lod) * v_41.uMaxTessLevel.y;
+}
+
+static inline __attribute__((always_inline))
+float tess_level(thread const float& lod, constant UBO& v_41)
+{
+    return v_41.uMaxTessLevel.y * exp2(-lod);
+}
+
+static inline __attribute__((always_inline))
+void compute_tess_levels(thread const float2& p0, constant UBO& v_41, device float2& vOutPatchPosBase, device float4& vPatchLods, device half (&gl_TessLevelOuter)[4], device half (&gl_TessLevelInner)[2])
+{
+    vOutPatchPosBase = p0;
+    float2 param = p0 + (float2(-0.5) * v_41.uPatchSize);
+    float l00 = lod_factor(param, v_41);
+    float2 param_1 = p0 + (float2(0.5, -0.5) * v_41.uPatchSize);
+    float l10 = lod_factor(param_1, v_41);
+    float2 param_2 = p0 + (float2(1.5, -0.5) * v_41.uPatchSize);
+    float l20 = lod_factor(param_2, v_41);
+    float2 param_3 = p0 + (float2(-0.5, 0.5) * v_41.uPatchSize);
+    float l01 = lod_factor(param_3, v_41);
+    float2 param_4 = p0 + (float2(0.5) * v_41.uPatchSize);
+    float l11 = lod_factor(param_4, v_41);
+    float2 param_5 = p0 + (float2(1.5, 0.5) * v_41.uPatchSize);
+    float l21 = lod_factor(param_5, v_41);
+    float2 param_6 = p0 + (float2(-0.5, 1.5) * v_41.uPatchSize);
+    float l02 = lod_factor(param_6, v_41);
+    float2 param_7 = p0 + (float2(0.5, 1.5) * v_41.uPatchSize);
+    float l12 = lod_factor(param_7, v_41);
+    float2 param_8 = p0 + (float2(1.5) * v_41.uPatchSize);
+    float l22 = lod_factor(param_8, v_41);
+    float4 lods = float4(dot(float4(l01, l11, l02, l12), float4(0.25)), dot(float4(l00, l10, l01, l11), float4(0.25)), dot(float4(l10, l20, l11, l21), float4(0.25)), dot(float4(l11, l21, l12, l22), float4(0.25)));
+    vPatchLods = lods;
+    float4 outer_lods = fast::min(lods, lods.yzwx);
+    float4 param_9 = outer_lods;
+    float4 levels = tess_level(param_9, v_41);
+    gl_TessLevelOuter[0] = half(levels.x);
+    gl_TessLevelOuter[1] = half(levels.y);
+    gl_TessLevelOuter[2] = half(levels.z);
+    gl_TessLevelOuter[3] = half(levels.w);
+    float min_lod = fast::min(fast::min(lods.x, lods.y), fast::min(lods.z, lods.w));
+    float param_10 = fast::min(min_lod, l11);
+    float inner = tess_level(param_10, v_41);
+    gl_TessLevelInner[0] = half(inner);
+    gl_TessLevelInner[1] = half(inner);
+}
+
+kernel void main0(constant UBO& v_41 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    float2 p0 = gl_in[0].vPatchPosBase.xy;
+    float2 param = p0;
+    if (!frustum_cull(param, v_41))
+    {
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(-1.0);
+    }
+    else
+    {
+        float2 param_1 = p0;
+        compute_tess_levels(param_1, v_41, patchOut.vOutPatchPosBase, patchOut.vPatchLods, spvTessLevel[gl_PrimitiveID].edgeTessellationFactor, spvTessLevel[gl_PrimitiveID].insideTessellationFactor);
+    }
+}
+
--- a/reference/shaders-msl/vert/basic.for-tess.vert
+++ b/reference/shaders-msl/vert/basic.for-tess.vert
@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _16.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
--- a/reference/shaders-msl/vert/leaf-function.for-tess.vert
+++ b/reference/shaders-msl/vert/leaf-function.for-tess.vert
@ -0,0 +1,39 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+static inline __attribute__((always_inline))
+void set_output(device float4& gl_Position, constant UBO& v_18, thread float4& aVertex, device float3& vNormal, thread float3& aNormal)
+{
+    gl_Position = v_18.uMVP * aVertex;
+    vNormal = aNormal;
+}
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& v_18 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    set_output(out.gl_Position, v_18, in.aVertex, out.vNormal, in.aNormal);
+}
+
--- a/reference/shaders-msl/vert/no_stage_out.for-tess.vert
+++ b/reference/shaders-msl/vert/no_stage_out.for-tess.vert
@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _10
+{
+    uint4 _m0[1024];
+};
+
+struct main0_in
+{
+    uint4 m_19 [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], device _10& _12 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]])
+{
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_VertexIndex = gl_GlobalInvocationID.x + spvDispatchBase.x;
+    _12._m0[int(gl_VertexIndex)] = in.m_19;
+}
+
--- a/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
+++ b/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@ -0,0 +1,248 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 2
+; Bound: 162
+; Schema: 0
+               OpCapability Tessellation
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %hs_main "main" %p_pos %p_1 %i_1 %_entryPointOutput_pos %_entryPointOutput %_patchConstantOutput_EdgeTess %_patchConstantOutput_InsideTess
+               OpExecutionMode %hs_main OutputVertices 3
+               OpExecutionMode %hs_main Triangles
+               OpExecutionMode %hs_main SpacingFractionalOdd
+               OpExecutionMode %hs_main VertexOrderCw
+               OpSource HLSL 500
+               OpName %hs_main "hs_main"
+               OpName %VertexOutput "VertexOutput"
+               OpMemberName %VertexOutput 0 "pos"
+               OpMemberName %VertexOutput 1 "uv"
+               OpName %HSOut "HSOut"
+               OpMemberName %HSOut 0 "pos"
+               OpMemberName %HSOut 1 "uv"
+               OpName %_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ "@hs_main(struct-VertexOutput-vf4-vf21[3];u1;"
+               OpName %p "p"
+               OpName %i "i"
+               OpName %HSConstantOut "HSConstantOut"
+               OpMemberName %HSConstantOut 0 "EdgeTess"
+               OpMemberName %HSConstantOut 1 "InsideTess"
+               OpName %PatchHS_struct_VertexOutput_vf4_vf21_3__ "PatchHS(struct-VertexOutput-vf4-vf21[3];"
+               OpName %patch "patch"
+               OpName %output "output"
+               OpName %p_0 "p"
+               OpName %p_pos "p.pos"
+               OpName %VertexOutput_0 "VertexOutput"
+               OpMemberName %VertexOutput_0 0 "uv"
+               OpName %p_1 "p"
+               OpName %i_0 "i"
+               OpName %i_1 "i"
+               OpName %flattenTemp "flattenTemp"
+               OpName %param "param"
+               OpName %param_0 "param"
+               OpName %_entryPointOutput_pos "@entryPointOutput.pos"
+               OpName %HSOut_0 "HSOut"
+               OpMemberName %HSOut_0 0 "uv"
+               OpName %_entryPointOutput "@entryPointOutput"
+               OpName %_patchConstantResult "@patchConstantResult"
+               OpName %param_1 "param"
+               OpName %_patchConstantOutput_EdgeTess "@patchConstantOutput.EdgeTess"
+               OpName %_patchConstantOutput_InsideTess "@patchConstantOutput.InsideTess"
+               OpName %output_0 "output"
+               OpDecorate %p_pos BuiltIn Position
+               OpDecorate %p_1 Location 0
+               OpDecorate %i_1 BuiltIn InvocationId
+               OpDecorate %_entryPointOutput_pos BuiltIn Position
+               OpDecorate %_entryPointOutput Location 0
+               OpDecorate %_patchConstantOutput_EdgeTess Patch
+               OpDecorate %_patchConstantOutput_EdgeTess BuiltIn TessLevelOuter
+               OpDecorate %_patchConstantOutput_InsideTess Patch
+               OpDecorate %_patchConstantOutput_InsideTess BuiltIn TessLevelInner
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+    %v2float = OpTypeVector %float 2
+%VertexOutput = OpTypeStruct %v4float %v2float
+       %uint = OpTypeInt 32 0
+     %uint_3 = OpConstant %uint 3
+%_arr_VertexOutput_uint_3 = OpTypeArray %VertexOutput %uint_3
+%_ptr_Function__arr_VertexOutput_uint_3 = OpTypePointer Function %_arr_VertexOutput_uint_3
+%_ptr_Function_uint = OpTypePointer Function %uint
+      %HSOut = OpTypeStruct %v4float %v2float
+         %16 = OpTypeFunction %HSOut %_ptr_Function__arr_VertexOutput_uint_3 %_ptr_Function_uint
+%_arr_float_uint_3 = OpTypeArray %float %uint_3
+%HSConstantOut = OpTypeStruct %_arr_float_uint_3 %float
+         %23 = OpTypeFunction %HSConstantOut %_ptr_Function__arr_VertexOutput_uint_3
+%_ptr_Function_HSOut = OpTypePointer Function %HSOut
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Function_v4float = OpTypePointer Function %v4float
+      %int_1 = OpConstant %int 1
+%_ptr_Function_v2float = OpTypePointer Function %v2float
+%_arr_v4float_uint_3 = OpTypeArray %v4float %uint_3
+%_ptr_Input__arr_v4float_uint_3 = OpTypePointer Input %_arr_v4float_uint_3
+      %p_pos = OpVariable %_ptr_Input__arr_v4float_uint_3 Input
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+%VertexOutput_0 = OpTypeStruct %v2float
+%_arr_VertexOutput_0_uint_3 = OpTypeArray %VertexOutput_0 %uint_3
+%_ptr_Input__arr_VertexOutput_0_uint_3 = OpTypePointer Input %_arr_VertexOutput_0_uint_3
+        %p_1 = OpVariable %_ptr_Input__arr_VertexOutput_0_uint_3 Input
+%_ptr_Input_v2float = OpTypePointer Input %v2float
+      %int_2 = OpConstant %int 2
+%_ptr_Input_uint = OpTypePointer Input %uint
+        %i_1 = OpVariable %_ptr_Input_uint Input
+%_ptr_Output__arr_v4float_uint_3 = OpTypePointer Output %_arr_v4float_uint_3
+%_entryPointOutput_pos = OpVariable %_ptr_Output__arr_v4float_uint_3 Output
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+    %HSOut_0 = OpTypeStruct %v2float
+%_arr_HSOut_0_uint_3 = OpTypeArray %HSOut_0 %uint_3
+%_ptr_Output__arr_HSOut_0_uint_3 = OpTypePointer Output %_arr_HSOut_0_uint_3
+%_entryPointOutput = OpVariable %_ptr_Output__arr_HSOut_0_uint_3 Output
+%_ptr_Output_v2float = OpTypePointer Output %v2float
+     %uint_2 = OpConstant %uint 2
+     %uint_1 = OpConstant %uint 1
+     %uint_0 = OpConstant %uint 0
+       %bool = OpTypeBool
+%_ptr_Function_HSConstantOut = OpTypePointer Function %HSConstantOut
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%_patchConstantOutput_EdgeTess = OpVariable %_ptr_Output__arr_float_uint_4 Output
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_Output_float = OpTypePointer Output %float
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%_patchConstantOutput_InsideTess = OpVariable %_ptr_Output__arr_float_uint_2 Output
+    %float_1 = OpConstant %float 1
+    %hs_main = OpFunction %void None %3
+          %5 = OpLabel
+        %p_0 = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+        %i_0 = OpVariable %_ptr_Function_uint Function
+%flattenTemp = OpVariable %_ptr_Function_HSOut Function
+      %param = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+    %param_0 = OpVariable %_ptr_Function_uint Function
+%_patchConstantResult = OpVariable %_ptr_Function_HSConstantOut Function
+    %param_1 = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+         %50 = OpAccessChain %_ptr_Input_v4float %p_pos %int_0
+         %51 = OpLoad %v4float %50
+         %52 = OpAccessChain %_ptr_Function_v4float %p_0 %int_0 %int_0
+               OpStore %52 %51
+         %58 = OpAccessChain %_ptr_Input_v2float %p_1 %int_0 %int_0
+         %59 = OpLoad %v2float %58
+         %60 = OpAccessChain %_ptr_Function_v2float %p_0 %int_0 %int_1
+               OpStore %60 %59
+         %61 = OpAccessChain %_ptr_Input_v4float %p_pos %int_1
+         %62 = OpLoad %v4float %61
+         %63 = OpAccessChain %_ptr_Function_v4float %p_0 %int_1 %int_0
+               OpStore %63 %62
+         %64 = OpAccessChain %_ptr_Input_v2float %p_1 %int_1 %int_0
+         %65 = OpLoad %v2float %64
+         %66 = OpAccessChain %_ptr_Function_v2float %p_0 %int_1 %int_1
+               OpStore %66 %65
+         %68 = OpAccessChain %_ptr_Input_v4float %p_pos %int_2
+         %69 = OpLoad %v4float %68
+         %70 = OpAccessChain %_ptr_Function_v4float %p_0 %int_2 %int_0
+               OpStore %70 %69
+         %71 = OpAccessChain %_ptr_Input_v2float %p_1 %int_2 %int_0
+         %72 = OpLoad %v2float %71
+         %73 = OpAccessChain %_ptr_Function_v2float %p_0 %int_2 %int_1
+               OpStore %73 %72
+         %77 = OpLoad %uint %i_1
+               OpStore %i_0 %77
+         %80 = OpLoad %_arr_VertexOutput_uint_3 %p_0
+               OpStore %param %80
+         %82 = OpLoad %uint %i_0
+               OpStore %param_0 %82
+         %83 = OpFunctionCall %HSOut %_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ %param %param_0
+               OpStore %flattenTemp %83
+         %86 = OpAccessChain %_ptr_Function_v4float %flattenTemp %int_0
+         %87 = OpLoad %v4float %86
+         %94 = OpLoad %uint %i_1
+         %89 = OpAccessChain %_ptr_Output_v4float %_entryPointOutput_pos %94
+               OpStore %89 %87
+         %95 = OpAccessChain %_ptr_Function_v2float %flattenTemp %int_1
+         %96 = OpLoad %v2float %95
+         %98 = OpAccessChain %_ptr_Output_v2float %_entryPointOutput %94 %int_0
+               OpStore %98 %96
+               OpControlBarrier %uint_2 %uint_1 %uint_0
+        %102 = OpLoad %uint %i_1
+        %104 = OpIEqual %bool %102 %int_0
+               OpSelectionMerge %106 None
+               OpBranchConditional %104 %105 %106
+        %105 = OpLabel
+        %110 = OpLoad %_arr_VertexOutput_uint_3 %p_0
+               OpStore %param_1 %110
+        %111 = OpFunctionCall %HSConstantOut %PatchHS_struct_VertexOutput_vf4_vf21_3__ %param_1
+               OpStore %_patchConstantResult %111
+        %117 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_0
+        %118 = OpLoad %float %117
+        %120 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_0
+               OpStore %120 %118
+        %121 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_1
+        %122 = OpLoad %float %121
+        %123 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_1
+               OpStore %123 %122
+        %124 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_2
+        %125 = OpLoad %float %124
+        %126 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_2
+               OpStore %126 %125
+        %130 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_1
+        %131 = OpLoad %float %130
+        %132 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_InsideTess %int_0
+               OpStore %132 %131
+               OpBranch %106
+        %106 = OpLabel
+               OpReturn
+               OpFunctionEnd
+%_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ = OpFunction %HSOut None %16
+          %p = OpFunctionParameter %_ptr_Function__arr_VertexOutput_uint_3
+          %i = OpFunctionParameter %_ptr_Function_uint
+         %20 = OpLabel
+     %output = OpVariable %_ptr_Function_HSOut Function
+         %31 = OpLoad %uint %i
+         %33 = OpAccessChain %_ptr_Function_v4float %p %31 %int_0
+         %34 = OpLoad %v4float %33
+         %35 = OpAccessChain %_ptr_Function_v4float %output %int_0
+               OpStore %35 %34
+         %37 = OpLoad %uint %i
+         %39 = OpAccessChain %_ptr_Function_v2float %p %37 %int_1
+         %40 = OpLoad %v2float %39
+         %41 = OpAccessChain %_ptr_Function_v2float %output %int_1
+               OpStore %41 %40
+         %42 = OpLoad %HSOut %output
+               OpReturnValue %42
+               OpFunctionEnd
+%PatchHS_struct_VertexOutput_vf4_vf21_3__ = OpFunction %HSConstantOut None %23
+      %patch = OpFunctionParameter %_ptr_Function__arr_VertexOutput_uint_3
+         %26 = OpLabel
+   %output_0 = OpVariable %_ptr_Function_HSConstantOut Function
+        %135 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %136 = OpLoad %v2float %135
+        %137 = OpCompositeConstruct %v2float %float_1 %float_1
+        %138 = OpFAdd %v2float %137 %136
+        %139 = OpCompositeExtract %float %138 0
+        %140 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_0
+               OpStore %140 %139
+        %141 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %142 = OpLoad %v2float %141
+        %143 = OpCompositeConstruct %v2float %float_1 %float_1
+        %144 = OpFAdd %v2float %143 %142
+        %145 = OpCompositeExtract %float %144 0
+        %146 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_1
+               OpStore %146 %145
+        %147 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %148 = OpLoad %v2float %147
+        %149 = OpCompositeConstruct %v2float %float_1 %float_1
+        %150 = OpFAdd %v2float %149 %148
+        %151 = OpCompositeExtract %float %150 0
+        %152 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_2
+               OpStore %152 %151
+        %153 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %154 = OpLoad %v2float %153
+        %155 = OpCompositeConstruct %v2float %float_1 %float_1
+        %156 = OpFAdd %v2float %155 %154
+        %157 = OpCompositeExtract %float %156 0
+        %158 = OpAccessChain %_ptr_Function_float %output_0 %int_1
+               OpStore %158 %157
+        %159 = OpLoad %HSConstantOut %output_0
+               OpReturnValue %159
+               OpFunctionEnd
--- a/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
+++ b/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@ -0,0 +1,102 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 46
+; Schema: 0
+               OpCapability Tessellation
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %main "main" %gl_TessLevelInner %gl_TessLevelOuter
+               OpExecutionMode %main OutputVertices 1
+               OpExecutionMode %main Triangles
+               OpSource ESSL 310
+               OpSourceExtension "GL_EXT_shader_io_blocks"
+               OpSourceExtension "GL_EXT_tessellation_shader"
+               OpName %main "main"
+               OpName %gl_TessLevelInner "gl_TessLevelInner"
+               OpName %TessLevels "TessLevels"
+               OpMemberName %TessLevels 0 "inner0"
+               OpMemberName %TessLevels 1 "inner1"
+               OpMemberName %TessLevels 2 "outer0"
+               OpMemberName %TessLevels 3 "outer1"
+               OpMemberName %TessLevels 4 "outer2"
+               OpMemberName %TessLevels 5 "outer3"
+               OpName %sb_levels "sb_levels"
+               OpName %gl_TessLevelOuter "gl_TessLevelOuter"
+               OpDecorate %gl_TessLevelInner Patch
+               OpDecorate %gl_TessLevelInner BuiltIn TessLevelInner
+               OpMemberDecorate %TessLevels 0 Restrict
+               OpMemberDecorate %TessLevels 0 NonWritable
+               OpMemberDecorate %TessLevels 0 Offset 0
+               OpMemberDecorate %TessLevels 1 Restrict
+               OpMemberDecorate %TessLevels 1 NonWritable
+               OpMemberDecorate %TessLevels 1 Offset 4
+               OpMemberDecorate %TessLevels 2 Restrict
+               OpMemberDecorate %TessLevels 2 NonWritable
+               OpMemberDecorate %TessLevels 2 Offset 8
+               OpMemberDecorate %TessLevels 3 Restrict
+               OpMemberDecorate %TessLevels 3 NonWritable
+               OpMemberDecorate %TessLevels 3 Offset 12
+               OpMemberDecorate %TessLevels 4 Restrict
+               OpMemberDecorate %TessLevels 4 NonWritable
+               OpMemberDecorate %TessLevels 4 Offset 16
+               OpMemberDecorate %TessLevels 5 Restrict
+               OpMemberDecorate %TessLevels 5 NonWritable
+               OpMemberDecorate %TessLevels 5 Offset 20
+               OpDecorate %TessLevels Block
+               OpDecorate %sb_levels DescriptorSet 0
+               OpDecorate %sb_levels Binding 0
+               OpDecorate %gl_TessLevelOuter Patch
+               OpDecorate %gl_TessLevelOuter BuiltIn TessLevelOuter
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+     %uint_2 = OpConstant %uint 2
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%gl_TessLevelInner = OpVariable %_ptr_Output__arr_float_uint_2 Output
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+ %TessLevels = OpTypeStruct %float %float %float %float %float %float
+%_ptr_StorageBuffer_TessLevels = OpTypePointer StorageBuffer %TessLevels
+  %sb_levels = OpVariable %_ptr_StorageBuffer_TessLevels StorageBuffer
+%_ptr_StorageBuffer_float = OpTypePointer StorageBuffer %float
+%_ptr_Output_float = OpTypePointer Output %float
+      %int_1 = OpConstant %int 1
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%gl_TessLevelOuter = OpVariable %_ptr_Output__arr_float_uint_4 Output
+      %int_2 = OpConstant %int 2
+      %int_3 = OpConstant %int 3
+      %int_4 = OpConstant %int 4
+      %int_5 = OpConstant %int 5
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %18 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_0
+         %19 = OpLoad %float %18
+         %21 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_0
+               OpStore %21 %19
+         %23 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_1
+         %24 = OpLoad %float %23
+         %25 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_1
+               OpStore %25 %24
+         %31 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_2
+         %32 = OpLoad %float %31
+         %33 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_0
+               OpStore %33 %32
+         %35 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_3
+         %36 = OpLoad %float %35
+         %37 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_1
+               OpStore %37 %36
+         %39 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_4
+         %40 = OpLoad %float %39
+         %41 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_2
+               OpStore %41 %40
+         %43 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_5
+         %44 = OpLoad %float %43
+         %45 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_3
+               OpStore %45 %44
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
+++ b/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@ -0,0 +1,32 @@
+#version 450
+layout(vertices = 1) out;
+
+in gl_PerVertex
+{
+   vec4 gl_Position;
+} gl_in[gl_MaxPatchVertices];
+
+out gl_PerVertex
+{
+   vec4 gl_Position;
+} gl_out[1];
+
+layout(location = 0) patch out vec3 vFoo;
+
+void set_position()
+{
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
+void main()
+{
+    gl_TessLevelInner[0] = 8.9;
+    gl_TessLevelInner[1] = 6.9;
+    gl_TessLevelOuter[0] = 8.9;
+    gl_TessLevelOuter[1] = 6.9;
+    gl_TessLevelOuter[2] = 3.9;
+    gl_TessLevelOuter[3] = 4.9;
+    vFoo = vec3(1.0);
+
+    set_position();
+}
--- a/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -0,0 +1,22 @@
+#version 450
+
+struct Boo
+{
+        vec3 a;
+        uvec3 b;
+};
+
+layout(vertices = 4) out;
+layout(location = 0) out Boo vVertex[];
+layout(location = 0) in Boo vInput[];
+
+void main()
+{
+        vVertex[gl_InvocationID] = vInput[gl_InvocationID];
+        gl_TessLevelOuter[0] = 1.0;
+        gl_TessLevelOuter[1] = 2.0;
+        gl_TessLevelOuter[2] = 3.0;
+        gl_TessLevelOuter[3] = 4.0;
+        gl_TessLevelInner[0] = 1.0;
+        gl_TessLevelInner[1] = 2.0;
+}
--- a/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
+++ b/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@ -0,0 +1,11 @@
+#version 460
+
+out gl_PerVertex
+{
+    vec4 gl_Position;
+};
+
+void main()
+{
+    gl_Position = vec4(gl_BaseVertex, gl_BaseInstance, 0, 1);
+}
--- a/shaders-msl/tesc/basic.multi-patch.tesc
+++ b/shaders-msl/tesc/basic.multi-patch.tesc
@ -0,0 +1,17 @@
+#version 310 es
+#extension GL_EXT_tessellation_shader : require
+
+layout(location = 0) patch out vec3 vFoo;
+
+layout(vertices = 1) out;
+
+void main()
+{
+    gl_TessLevelInner[0] = 8.9;
+    gl_TessLevelInner[1] = 6.9;
+    gl_TessLevelOuter[0] = 8.9;
+    gl_TessLevelOuter[1] = 6.9;
+    gl_TessLevelOuter[2] = 3.9;
+    gl_TessLevelOuter[3] = 4.9;
+    vFoo = vec3(1.0);
+}
--- a/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
+++ b/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@ -0,0 +1,12 @@
+#version 450
+
+layout(vertices = 4) out;
+
+layout(location = 0) in mat4 vInputs[gl_MaxPatchVertices];
+layout(location = 0) out mat4 vOutputs[4];
+
+void main()
+{
+	mat4 tmp[gl_MaxPatchVertices] = vInputs;
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID];
+}
--- a/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -0,0 +1,21 @@
+#version 450
+
+layout(vertices = 4) out;
+
+struct VertexData
+{
+	mat4 a;
+	vec4 b[2];
+	vec4 c;
+};
+
+layout(location = 0) in VertexData vInputs[gl_MaxPatchVertices];
+layout(location = 0) out vec4 vOutputs[4];
+
+void main()
+{
+	VertexData tmp[gl_MaxPatchVertices] = vInputs;
+	VertexData tmp_single = vInputs[gl_InvocationID ^ 1];
+
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1] + tmp[gl_InvocationID].c + tmp_single.c;
+}
--- a/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
+++ b/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@ -0,0 +1,12 @@
+#version 450
+
+layout(vertices = 4) out;
+
+layout(location = 0) in vec4 vInputs[gl_MaxPatchVertices];
+layout(location = 0) out vec4 vOutputs[4];
+
+void main()
+{
+	vec4 tmp[gl_MaxPatchVertices] = vInputs;
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID];
+}
--- a/shaders-msl/tesc/water_tess.multi-patch.tesc
+++ b/shaders-msl/tesc/water_tess.multi-patch.tesc
@ -0,0 +1,115 @@
+#version 310 es
+#extension GL_EXT_tessellation_shader : require
+
+layout(vertices = 1) out;
+layout(location = 0) in vec2 vPatchPosBase[];
+
+layout(std140) uniform UBO
+{
+    vec4 uScale;
+    highp vec3 uCamPos;
+    vec2 uPatchSize;
+    vec2 uMaxTessLevel;
+    float uDistanceMod;
+    vec4 uFrustum[6];
+};
+
+layout(location = 1) patch out vec2 vOutPatchPosBase;
+layout(location = 2) patch out vec4 vPatchLods;
+
+float lod_factor(vec2 pos_)
+{
+    vec2 pos = pos_ * uScale.xy;
+    vec3 dist_to_cam = uCamPos - vec3(pos.x, 0.0, pos.y);
+    float level = log2((length(dist_to_cam) + 0.0001) * uDistanceMod);
+    return clamp(level, 0.0, uMaxTessLevel.x);
+}
+
+float tess_level(float lod)
+{
+    return uMaxTessLevel.y * exp2(-lod);
+}
+
+vec4 tess_level(vec4 lod)
+{
+    return uMaxTessLevel.y * exp2(-lod);
+}
+
+// Guard band for vertex displacement.
+#define GUARD_BAND 10.0
+bool frustum_cull(vec2 p0)
+{
+    vec2 min_xz = (p0 - GUARD_BAND) * uScale.xy;
+    vec2 max_xz = (p0 + uPatchSize + GUARD_BAND) * uScale.xy;
+
+    vec3 bb_min = vec3(min_xz.x, -GUARD_BAND, min_xz.y);
+    vec3 bb_max = vec3(max_xz.x, +GUARD_BAND, max_xz.y);
+    vec3 center = 0.5 * (bb_min + bb_max);
+    float radius = 0.5 * length(bb_max - bb_min);
+
+    vec3 f0 = vec3(
+        dot(uFrustum[0], vec4(center, 1.0)),
+        dot(uFrustum[1], vec4(center, 1.0)),
+        dot(uFrustum[2], vec4(center, 1.0)));
+
+    vec3 f1 = vec3(
+        dot(uFrustum[3], vec4(center, 1.0)),
+        dot(uFrustum[4], vec4(center, 1.0)),
+        dot(uFrustum[5], vec4(center, 1.0)));
+
+    return !(any(lessThanEqual(f0, vec3(-radius))) || any(lessThanEqual(f1, vec3(-radius))));
+}
+
+void compute_tess_levels(vec2 p0)
+{
+    vOutPatchPosBase = p0;
+
+    float l00 = lod_factor(p0 + vec2(-0.5, -0.5) * uPatchSize);
+    float l10 = lod_factor(p0 + vec2(+0.5, -0.5) * uPatchSize);
+    float l20 = lod_factor(p0 + vec2(+1.5, -0.5) * uPatchSize);
+    float l01 = lod_factor(p0 + vec2(-0.5, +0.5) * uPatchSize);
+    float l11 = lod_factor(p0 + vec2(+0.5, +0.5) * uPatchSize);
+    float l21 = lod_factor(p0 + vec2(+1.5, +0.5) * uPatchSize);
+    float l02 = lod_factor(p0 + vec2(-0.5, +1.5) * uPatchSize);
+    float l12 = lod_factor(p0 + vec2(+0.5, +1.5) * uPatchSize);
+    float l22 = lod_factor(p0 + vec2(+1.5, +1.5) * uPatchSize);
+
+    vec4 lods = vec4(
+        dot(vec4(l01, l11, l02, l12), vec4(0.25)),
+        dot(vec4(l00, l10, l01, l11), vec4(0.25)),
+        dot(vec4(l10, l20, l11, l21), vec4(0.25)),
+        dot(vec4(l11, l21, l12, l22), vec4(0.25)));
+
+    vPatchLods = lods;
+
+    vec4 outer_lods = min(lods.xyzw, lods.yzwx);
+    vec4 levels = tess_level(outer_lods);
+    gl_TessLevelOuter[0] = levels.x;
+    gl_TessLevelOuter[1] = levels.y;
+    gl_TessLevelOuter[2] = levels.z;
+    gl_TessLevelOuter[3] = levels.w;
+
+    float min_lod = min(min(lods.x, lods.y), min(lods.z, lods.w));
+    float inner = tess_level(min(min_lod, l11));
+    gl_TessLevelInner[0] = inner;
+    gl_TessLevelInner[1] = inner;
+}
+
+void main()
+{
+    vec2 p0 = vPatchPosBase[0];
+    if (!frustum_cull(p0))
+    {
+        gl_TessLevelOuter[0] = -1.0;
+        gl_TessLevelOuter[1] = -1.0;
+        gl_TessLevelOuter[2] = -1.0;
+        gl_TessLevelOuter[3] = -1.0;
+        gl_TessLevelInner[0] = -1.0;
+        gl_TessLevelInner[1] = -1.0;
+    }
+    else
+    {
+        compute_tess_levels(p0);
+    }
+}
+
--- a/shaders-msl/vert/basic.for-tess.vert
+++ b/shaders-msl/vert/basic.for-tess.vert
@ -0,0 +1,17 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+    uniform mat4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+layout(location = 1) in vec3 aNormal;
+
+layout(location = 0) out vec3 vNormal;
+
+void main()
+{
+    gl_Position = uMVP * aVertex;
+    vNormal = aNormal;
+}
--- a/shaders-msl/vert/leaf-function.for-tess.vert
+++ b/shaders-msl/vert/leaf-function.for-tess.vert
@ -0,0 +1,22 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+    uniform mat4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+layout(location = 1) in vec3 aNormal;
+
+layout(location = 0) out vec3 vNormal;
+
+void set_output()
+{
+    gl_Position = uMVP * aVertex;
+    vNormal = aNormal;
+}
+
+void main()
+{
+    set_output();
+}
--- a/shaders-msl/vert/no_stage_out.for-tess.vert
+++ b/shaders-msl/vert/no_stage_out.for-tess.vert
@ -0,0 +1,14 @@
+#version 450
+
+layout(binding = 0, std430) writeonly buffer _10_12
+{
+	uvec4 _m0[1024];
+} _12;
+
+layout(location = 0) in uvec4 _19;
+
+void main()
+{
+	_12._m0[gl_VertexIndex] = _19;
+}
+
--- a/spirv_common.hpp
+++ b/spirv_common.hpp
@ -1589,8 +1589,10 @@ enum ExtendedDecorations
 	// Marks a buffer block for using explicit offsets (GLSL/HLSL).
 	SPIRVCrossDecorationExplicitOffset,

-	// Apply to a variable in the Input storage class; marks it as holding the base group passed to vkCmdDispatchBase().
-	// In MSL, this is used to adjust the WorkgroupId and GlobalInvocationId variables.
+	// Apply to a variable in the Input storage class; marks it as holding the base group passed to vkCmdDispatchBase(),
+	// or the base vertex and instance indices passed to vkCmdDrawIndexed().
+	// In MSL, this is used to adjust the WorkgroupId and GlobalInvocationId variables in compute shaders,
+	// and to hold the BaseVertex and BaseInstance variables in vertex shaders.
 	SPIRVCrossDecorationBuiltInDispatchBase,

 	// Apply to a variable that is a function parameter; marks it as being a "dynamic"
@ -1599,6 +1601,20 @@ enum ExtendedDecorations
 	// Y'CbCr conversion.
 	SPIRVCrossDecorationDynamicImageSampler,

+	// Apply to a variable in the Input storage class; marks it as holding the size of the stage
+	// input grid.
+	// In MSL, this is used to hold the vertex and instance counts in a tessellation pipeline
+	// vertex shader.
+	SPIRVCrossDecorationBuiltInStageInputSize,
+
+	// Apply to any access chain of a tessellation I/O variable; stores the type of the sub-object
+	// that was chained to, as recorded in the input variable itself. This is used in case the pointer
+	// is itself used as the base of an access chain, to calculate the original type of the sub-object
+	// chained to, in case a swizzle needs to be applied. This should not happen normally with valid
+	// SPIR-V, but the MSL backend can change the type of input variables, necessitating the
+	// addition of swizzles to keep the generated code compiling.
+	SPIRVCrossDecorationTessIOOriginalInputTypeID,
+
 	SPIRVCrossDecorationCount
 };

--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@ -635,6 +635,26 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_MSL_ENABLE_CLIP_DISTANCE_USER_VARYING:
 		options->msl.enable_clip_distance_user_varying = value != 0;
 		break;
+
+	case SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP:
+		options->msl.multi_patch_workgroup = value != 0;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX:
+		options->msl.shader_input_buffer_index = value;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX:
+		options->msl.shader_index_buffer_index = value;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION:
+		options->msl.vertex_for_tessellation = value != 0;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE:
+		options->msl.vertex_index_type = static_cast<CompilerMSL::Options::IndexType>(value);
+		break;
 #endif

 	default:
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@ -33,7 +33,7 @@ extern "C" {
 /* Bumped if ABI or API breaks backwards compatibility. */
 #define SPVC_C_API_VERSION_MAJOR 0
 /* Bumped if APIs or enumerations are added in a backwards compatible way. */
-#define SPVC_C_API_VERSION_MINOR 35
+#define SPVC_C_API_VERSION_MINOR 36
 /* Bumped if internal implementation details change. */
 #define SPVC_C_API_VERSION_PATCH 0

@ -258,12 +258,23 @@ typedef enum spvc_msl_platform
 	SPVC_MSL_PLATFORM_MAX_INT = 0x7fffffff
 } spvc_msl_platform;

+/* Maps to C++ API. */
+typedef enum spvc_msl_index_type
+{
+	SPVC_MSL_INDEX_TYPE_NONE = 0,
+	SPVC_MSL_INDEX_TYPE_UINT16 = 1,
+	SPVC_MSL_INDEX_TYPE_UINT32 = 2,
+	SPVC_MSL_INDEX_TYPE_MAX_INT = 0x7fffffff
+} spvc_msl_index_type;
+
 /* Maps to C++ API. */
 typedef enum spvc_msl_shader_input_format
 {
 	SPVC_MSL_SHADER_INPUT_FORMAT_OTHER = 0,
 	SPVC_MSL_SHADER_INPUT_FORMAT_UINT8 = 1,
 	SPVC_MSL_SHADER_INPUT_FORMAT_UINT16 = 2,
+	SPVC_MSL_SHADER_INPUT_FORMAT_ANY16 = 3,
+	SPVC_MSL_SHADER_INPUT_FORMAT_ANY32 = 4,

 	/* Deprecated names. */
 	SPVC_MSL_VERTEX_FORMAT_OTHER = SPVC_MSL_SHADER_INPUT_FORMAT_OTHER,
@ -617,6 +628,12 @@ typedef enum spvc_compiler_option

 	SPVC_COMPILER_OPTION_HLSL_ENABLE_16BIT_TYPES = 60 | SPVC_COMPILER_OPTION_HLSL_BIT,

+	SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP = 61 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX = 62 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX = 63 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION = 64 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE = 65 | SPVC_COMPILER_OPTION_MSL_BIT,
+
 	SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 } spvc_compiler_option;

--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@ -35,6 +35,8 @@ enum MSLShaderInputFormat
 	MSL_SHADER_INPUT_FORMAT_OTHER = 0,
 	MSL_SHADER_INPUT_FORMAT_UINT8 = 1,
 	MSL_SHADER_INPUT_FORMAT_UINT16 = 2,
+	MSL_SHADER_INPUT_FORMAT_ANY16 = 3,
+	MSL_SHADER_INPUT_FORMAT_ANY32 = 4,

 	// Deprecated aliases.
 	MSL_VERTEX_FORMAT_OTHER = MSL_SHADER_INPUT_FORMAT_OTHER,
@ -271,6 +273,8 @@ public:
 		uint32_t buffer_size_buffer_index = 25;
 		uint32_t view_mask_buffer_index = 24;
 		uint32_t dynamic_offsets_buffer_index = 23;
+		uint32_t shader_input_buffer_index = 22;
+		uint32_t shader_index_buffer_index = 21;
 		uint32_t shader_input_wg_index = 0;
 		uint32_t device_index = 0;
 		uint32_t enable_frag_output_mask = 0xffffffff;
@ -329,6 +333,31 @@ public:
 		// can be read in subsequent stages.
 		bool enable_clip_distance_user_varying = true;

+		// In a tessellation control shader, assume that more than one patch can be processed in a
+		// single workgroup. This requires changes to the way the InvocationId and PrimitiveId
+		// builtins are processed, but should result in more efficient usage of the GPU.
+		bool multi_patch_workgroup = false;
+
+		// If set, a vertex shader will be compiled as part of a tessellation pipeline.
+		// It will be translated as a compute kernel, so it can use the global invocation ID
+		// to index the output buffer.
+		bool vertex_for_tessellation = false;
+
+		enum class IndexType
+		{
+			None = 0,
+			UInt16 = 1,
+			UInt32 = 2
+		};
+
+		// The type of index in the index buffer, if present. For a compute shader, Metal
+		// requires specifying the indexing at pipeline creation, rather than at draw time
+		// as with graphics pipelines. This means we must create three different pipelines,
+		// for no indexing, 16-bit indices, and 32-bit indices. Each requires different
+		// handling for the gl_VertexIndex builtin. We may as well, then, create three
+		// different shaders for these three scenarios.
+		IndexType vertex_index_type = IndexType::None;
+
 		bool is_ios() const
 		{
 			return platform == iOS;
@ -431,7 +460,7 @@ public:
 	// input is a shader input description used to fix up shader input variables.
 	// If shader inputs are provided, is_msl_shader_input_used() will return true after
 	// calling ::compile() if the location was used by the MSL code.
-	void add_msl_shader_input(const MSLShaderInput &attr);
+	void add_msl_shader_input(const MSLShaderInput &input);

 	// resource is a resource binding to indicate the MSL buffer,
 	// texture or sampler index to use for a particular SPIR-V description set
@ -692,7 +721,7 @@ protected:

 	void fix_up_interface_member_indices(spv::StorageClass storage, uint32_t ib_type_id);

-	void mark_location_as_used_by_shader(uint32_t location, spv::StorageClass storage);
+	void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, spv::StorageClass storage);
 	uint32_t ensure_correct_builtin_type(uint32_t type_id, spv::BuiltIn builtin);
 	uint32_t ensure_correct_input_type(uint32_t type_id, uint32_t location, uint32_t num_components = 0);

@ -739,7 +768,13 @@ protected:
 	uint32_t get_declared_struct_member_matrix_stride_msl(const SPIRType &struct_type, uint32_t index) const;
 	uint32_t get_declared_struct_member_alignment_msl(const SPIRType &struct_type, uint32_t index) const;

+	uint32_t get_declared_input_size_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_array_stride_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_matrix_stride_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_alignment_msl(const SPIRType &struct_type, uint32_t index) const;
+
 	const SPIRType &get_physical_member_type(const SPIRType &struct_type, uint32_t index) const;
+	SPIRType get_presumed_input_type(const SPIRType &struct_type, uint32_t index) const;

 	uint32_t get_declared_struct_size_msl(const SPIRType &struct_type, bool ignore_alignment = false,
 	                                      bool ignore_padding = false) const;
@ -785,6 +820,7 @@ protected:
 	uint32_t builtin_subgroup_invocation_id_id = 0;
 	uint32_t builtin_subgroup_size_id = 0;
 	uint32_t builtin_dispatch_base_id = 0;
+	uint32_t builtin_stage_input_size_id = 0;
 	uint32_t swizzle_buffer_id = 0;
 	uint32_t buffer_size_buffer_id = 0;
 	uint32_t view_mask_buffer_id = 0;
@ -811,7 +847,8 @@ protected:

 	Options msl_options;
 	std::set<SPVFuncImpl> spv_function_implementations;
-	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_location;
+	// Must be ordered to ensure declarations are in a specific order.
+	std::map<uint32_t, MSLShaderInput> inputs_by_location;
 	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_builtin;
 	std::unordered_set<uint32_t> inputs_in_use;
 	std::unordered_map<uint32_t, uint32_t> fragment_output_components;
@ -866,9 +903,11 @@ protected:
 	std::string buffer_size_name_suffix = "BufferSize";
 	std::string plane_name_suffix = "Plane";
 	std::string input_wg_var_name = "gl_in";
+	std::string input_buffer_var_name = "spvIn";
 	std::string output_buffer_var_name = "spvOut";
 	std::string patch_output_buffer_var_name = "spvPatchOut";
 	std::string tess_factor_buffer_var_name = "spvTessLevel";
+	std::string index_buffer_var_name = "spvIndices";
 	spv::Op previous_instruction_opcode = spv::OpNop;

 	// Must be ordered since declaration is in a specific order.
--- a/test_shaders.py
+++ b/test_shaders.py
@ -280,6 +280,19 @@ def cross_compile_msl(shader, spirv, opt, iterations, paths):
        msl_args.append('6')
        msl_args.append('other')
        msl_args.append('4')
+    if '.multi-patch.' in shader:
+        msl_args.append('--msl-multi-patch-workgroup')
+        # Arbitrary for testing purposes.
+        msl_args.append('--msl-shader-input')
+        msl_args.append('0')
+        msl_args.append('any32')
+        msl_args.append('3')
+        msl_args.append('--msl-shader-input')
+        msl_args.append('1')
+        msl_args.append('any16')
+        msl_args.append('2')
+    if '.for-tess.' in shader:
+        msl_args.append('--msl-vertex-for-tessellation')

    subprocess.check_call(msl_args)