From 688c5fcbda2abe98889fd5de70fd58e0b36c8a26 Mon Sep 17 00:00:00 2001
From: Chip Davis <cdavis@codeweavers.com>
Date: Thu, 20 Feb 2020 21:38:28 -0600
Subject: [PATCH] MSL: Add support for processing more than one patch per
 workgroup.

This should hopefully reduce underutilization of the GPU, especially on
GPUs where the thread execution width is greater than the number of
control points.

This also simplifies initialization by reading the buffer directly
instead of using Metal's vertex-attribute-in-compute support. It turns
out the only way in which shader stages are allowed to differ in their
interfaces is in the number of components per vector; the base type must
be the same. Since we are using the raw buffer instead of attributes, we
can now also emit arrays and matrices directly into the buffer, instead
of flattening them and then unpacking them. Structs are still flattened,
however; this is due to the need to handle vectors with fewer components
than were output, and I think handling this while also directly emitting
structs could get ugly.

Another advantage of this scheme is that the extra invocations needed to
read the attributes when there were more input than output points are
now no more. The number of threads per workgroup is now lcm(SIMD-size,
output control points). This should ensure we always process a whole
number of patches per workgroup.

To avoid complexity handling indices in the tessellation control shader,
I've also changed the way vertex shaders for tessellation are handled.
They are now compute kernels using Metal's support for vertex-style
stage input. This lets us always emit vertices into the buffer in order
of vertex shader execution. Now we no longer have to deal with indexing
in the tessellation control shader. This also fixes a long-standing
issue where if an index were greater than the number of vertices to
draw, the vertex shader would wind up writing outside the buffer, and
the vertex would be lost.

This is a breaking change, and I know SPIRV-Cross has other clients, so
I've hidden this behind an option for now. In the future, I want to
remove this option and make it the default.
---
 CMakeLists.txt                                |   2 +-
 main.cpp                                      |  28 +-
 .../tess-level-overrun.multi-patch.asm.tesc   |  24 +
 .../tesc/basic.desktop.sso.multi-patch.tesc   |  39 +
 .../struct-copy.desktop.sso.multi-patch.tesc  |  38 +
 ...ader-draw-parameters.desktop.for-tess.vert |  20 +
 .../shaders-msl/tesc/basic.multi-patch.tesc   |  23 +
 ...rol-point-array-of-matrix.multi-patch.tesc |  68 ++
 ...rol-point-array-of-struct.multi-patch.tesc |  78 ++
 .../load-control-point-array.multi-patch.tesc |  69 ++
 .../tesc/water_tess.multi-patch.tesc          |  91 +++
 .../opt/shaders-msl/vert/basic.for-tess.vert  |  31 +
 .../vert/leaf-function.for-tess.vert          |  31 +
 .../vert/no_stage_out.for-tess.vert           |  23 +
 ...builtin-array.invalid.multi-patch.asm.tesc | 140 ++++
 .../tess-level-overrun.multi-patch.asm.tesc   |  24 +
 .../tesc/basic.desktop.sso.multi-patch.tesc   |  47 ++
 .../struct-copy.desktop.sso.multi-patch.tesc  |  38 +
 ...ader-draw-parameters.desktop.for-tess.vert |  20 +
 .../shaders-msl/tesc/basic.multi-patch.tesc   |  23 +
 ...rol-point-array-of-matrix.multi-patch.tesc |  68 ++
 ...rol-point-array-of-struct.multi-patch.tesc |  80 ++
 .../load-control-point-array.multi-patch.tesc |  69 ++
 .../tesc/water_tess.multi-patch.tesc          | 135 ++++
 .../shaders-msl/vert/basic.for-tess.vert      |  31 +
 .../vert/leaf-function.for-tess.vert          |  39 +
 .../vert/no_stage_out.for-tess.vert           |  23 +
 ...builtin-array.invalid.multi-patch.asm.tesc | 248 +++++++
 .../tess-level-overrun.multi-patch.asm.tesc   | 102 +++
 .../tesc/basic.desktop.sso.multi-patch.tesc   |  32 +
 .../struct-copy.desktop.sso.multi-patch.tesc  |  22 +
 ...ader-draw-parameters.desktop.for-tess.vert |  11 +
 shaders-msl/tesc/basic.multi-patch.tesc       |  17 +
 ...rol-point-array-of-matrix.multi-patch.tesc |  12 +
 ...rol-point-array-of-struct.multi-patch.tesc |  21 +
 .../load-control-point-array.multi-patch.tesc |  12 +
 shaders-msl/tesc/water_tess.multi-patch.tesc  | 115 +++
 shaders-msl/vert/basic.for-tess.vert          |  17 +
 shaders-msl/vert/leaf-function.for-tess.vert  |  22 +
 shaders-msl/vert/no_stage_out.for-tess.vert   |  14 +
 spirv_common.hpp                              |  20 +-
 spirv_cross_c.cpp                             |  20 +
 spirv_cross_c.h                               |  19 +-
 spirv_msl.cpp                                 | 698 +++++++++++++++---
 spirv_msl.hpp                                 |  45 +-
 test_shaders.py                               |  13 +
 46 files changed, 2649 insertions(+), 113 deletions(-)
 create mode 100644 reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
 create mode 100644 reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
 create mode 100644 reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/vert/basic.for-tess.vert
 create mode 100644 reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
 create mode 100644 reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
 create mode 100644 reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
 create mode 100644 reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
 create mode 100644 reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
 create mode 100644 reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
 create mode 100644 reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
 create mode 100644 reference/shaders-msl/tesc/basic.multi-patch.tesc
 create mode 100644 reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
 create mode 100644 reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
 create mode 100644 reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
 create mode 100644 reference/shaders-msl/tesc/water_tess.multi-patch.tesc
 create mode 100644 reference/shaders-msl/vert/basic.for-tess.vert
 create mode 100644 reference/shaders-msl/vert/leaf-function.for-tess.vert
 create mode 100644 reference/shaders-msl/vert/no_stage_out.for-tess.vert
 create mode 100644 shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
 create mode 100644 shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
 create mode 100644 shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
 create mode 100644 shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
 create mode 100644 shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
 create mode 100644 shaders-msl/tesc/basic.multi-patch.tesc
 create mode 100644 shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
 create mode 100644 shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
 create mode 100644 shaders-msl/tesc/load-control-point-array.multi-patch.tesc
 create mode 100644 shaders-msl/tesc/water_tess.multi-patch.tesc
 create mode 100644 shaders-msl/vert/basic.for-tess.vert
 create mode 100644 shaders-msl/vert/leaf-function.for-tess.vert
 create mode 100644 shaders-msl/vert/no_stage_out.for-tess.vert

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 639cd68e..da3e0ebf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -323,7 +323,7 @@ if (SPIRV_CROSS_STATIC)
 endif()
 
 set(spirv-cross-abi-major 0)
-set(spirv-cross-abi-minor 35)
+set(spirv-cross-abi-minor 36)
 set(spirv-cross-abi-patch 0)
 
 if (SPIRV_CROSS_SHARED)
diff --git a/main.cpp b/main.cpp
index 827b401e..ef8363e0 100644
--- a/main.cpp
+++ b/main.cpp
@@ -558,6 +558,8 @@ struct CLIArguments
 	bool msl_enable_frag_stencil_ref_builtin = true;
 	uint32_t msl_enable_frag_output_mask = 0xffffffff;
 	bool msl_enable_clip_distance_user_varying = true;
+	bool msl_multi_patch_workgroup = false;
+	bool msl_vertex_for_tessellation = false;
 	bool glsl_emit_push_constant_as_ubo = false;
 	bool glsl_emit_ubo_as_plain_uniforms = false;
 	SmallVector<pair<uint32_t, uint32_t>> glsl_ext_framebuffer_fetch;
@@ -747,9 +749,15 @@ static void print_help_msl()
 	                "\t[--msl-enable-frag-output-mask <mask>]:\n\t\tOnly selectively enable fragment outputs. Useful if pipeline does not enable fragment output for certain locations, as pipeline creation might otherwise fail.\n"
 	                "\t[--msl-no-clip-distance-user-varying]:\n\t\tDo not emit user varyings to emulate gl_ClipDistance in fragment shaders.\n"
 	                "\t[--msl-shader-input <index> <format> <size>]:\n\t\tSpecify the format of the shader input at <index>.\n"
-	                "\t\t<format> can be 'u16', 'u8', or 'other', to indicate a 16-bit unsigned integer, 8-bit unsigned integer, "
+	                "\t\t<format> can be 'any32', 'any16', 'u16', 'u8', or 'other', to indicate a 32-bit opaque value, 16-bit opaque value, 16-bit unsigned integer, 8-bit unsigned integer, "
 	                "or other-typed variable. <size> is the vector length of the variable, which must be greater than or equal to that declared in the shader.\n"
-	                "\t\tUseful if shader stage interfaces don't match up, as pipeline creation might otherwise fail.\n");
+	                "\t\tUseful if shader stage interfaces don't match up, as pipeline creation might otherwise fail.\n"
+	                "\t[--msl-multi-patch-workgroup]:\n\t\tUse the new style of tessellation control processing, where multiple patches are processed per workgroup.\n"
+					"\t\tThis should increase throughput by ensuring all the GPU's SIMD lanes are occupied, but it is not compatible with the old style.\n"
+					"\t\tIn addition, this style also passes input variables in buffers directly instead of using vertex attribute processing.\n"
+					"\t\tIn a future version of SPIRV-Cross, this will become the default.\n"
+	                "\t[--msl-vertex-for-tessellation]:\n\t\tWhen handling a vertex shader, marks it as one that will be used with a new-style tessellation control shader.\n"
+					"\t\tThe vertex shader is output to MSL as a compute kernel which outputs vertices to the buffer in the order they are received, rather than in index order as with --msl-capture-output normally.\n");
 	// clang-format on
 }
 
@@ -983,6 +991,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		msl_opts.enable_frag_stencil_ref_builtin = args.msl_enable_frag_stencil_ref_builtin;
 		msl_opts.enable_frag_output_mask = args.msl_enable_frag_output_mask;
 		msl_opts.enable_clip_distance_user_varying = args.msl_enable_clip_distance_user_varying;
+		msl_opts.multi_patch_workgroup = args.msl_multi_patch_workgroup;
+		msl_opts.vertex_for_tessellation = args.msl_vertex_for_tessellation;
 		msl_comp->set_msl_options(msl_opts);
 		for (auto &v : args.msl_discrete_descriptor_sets)
 			msl_comp->add_discrete_descriptor_set(v);
@@ -1381,15 +1391,21 @@ static int main_inner(int argc, char *argv[])
 		// Make sure next_uint() is called in-order.
 		input.location = parser.next_uint();
 		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "u16") == 0)
-			input.format = MSL_VERTEX_FORMAT_UINT16;
+		if (strcmp(format, "any32") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_ANY32;
+		else if (strcmp(format, "any16") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_ANY16;
+		else if (strcmp(format, "u16") == 0)
+			input.format = MSL_SHADER_INPUT_FORMAT_UINT16;
 		else if (strcmp(format, "u8") == 0)
-			input.format = MSL_VERTEX_FORMAT_UINT8;
+			input.format = MSL_SHADER_INPUT_FORMAT_UINT8;
 		else
-			input.format = MSL_VERTEX_FORMAT_OTHER;
+			input.format = MSL_SHADER_INPUT_FORMAT_OTHER;
 		input.vecsize = parser.next_uint();
 		args.msl_shader_inputs.push_back(input);
 	});
+	cbs.add("--msl-multi-patch-workgroup", [&args](CLIParser &) { args.msl_multi_patch_workgroup = true; });
+	cbs.add("--msl-vertex-for-tessellation", [&args](CLIParser &) { args.msl_vertex_for_tessellation = true; });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point", [&args](CLIParser &parser) {
 		auto old_name = parser.next_string();
diff --git a/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc b/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
new file mode 100644
index 00000000..a5f316dd
--- /dev/null
+++ b/reference/opt/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TessLevels
+{
+    float inner0;
+    float inner1;
+    float outer0;
+    float outer1;
+    float outer2;
+    float outer3;
+};
+
+kernel void main0(const device TessLevels& sb_levels [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(sb_levels.inner0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(sb_levels.outer0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(sb_levels.outer1);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(sb_levels.outer2);
+}
+
diff --git a/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc b/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
new file mode 100644
index 00000000..863a32a8
--- /dev/null
+++ b/reference/opt/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@@ -0,0 +1,39 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+struct main0_in
+{
+    uint3 m_86;
+    ushort2 m_90;
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 1];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 1;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
diff --git a/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc b/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
new file mode 100644
index 00000000..e47d56a2
--- /dev/null
+++ b/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@@ -0,0 +1,38 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Boo
+{
+    float3 a;
+    uint3 b;
+};
+
+struct main0_out
+{
+    Boo vVertex;
+};
+
+struct main0_in
+{
+    float3 Boo_a;
+    uint3 Boo_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
+    gl_out[gl_InvocationID].vVertex = _26;
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+}
+
diff --git a/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert b/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
new file mode 100644
index 00000000..b3c8b6bb
--- /dev/null
+++ b/reference/opt/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_BaseVertex = spvDispatchBase.x;
+    uint gl_BaseInstance = spvDispatchBase.y;
+    out.gl_Position = float4(float(int(gl_BaseVertex)), float(int(gl_BaseInstance)), 0.0, 1.0);
+}
+
diff --git a/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc b/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
new file mode 100644
index 00000000..b1403a3f
--- /dev/null
+++ b/reference/opt/shaders-msl/tesc/basic.multi-patch.tesc
@@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+}
+
diff --git a/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc b/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
new file mode 100644
index 00000000..4f4cf0b0
--- /dev/null
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4x4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 vInputs;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4x4, 32> _16 = spvUnsafeArray<float4x4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4x4, 32> tmp;
+    tmp = _16;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
diff --git a/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc b/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
new file mode 100644
index 00000000..8bd5515b
--- /dev/null
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@@ -0,0 +1,78 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexData
+{
+    float4x4 a;
+    spvUnsafeArray<float4, 2> b;
+    float4 c;
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 VertexData_a;
+    spvUnsafeArray<float4, 2> VertexData_b;
+    float4 VertexData_c;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> tmp;
+    tmp = _19;
+    int _27 = gl_InvocationID ^ 1;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + gl_in[_27].VertexData_c;
+}
+
diff --git a/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc b/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
new file mode 100644
index 00000000..416bc681
--- /dev/null
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@@ -0,0 +1,69 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4 vInputs;
+    ushort2 m_43;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4, 32> _15 = spvUnsafeArray<float4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4, 32> tmp;
+    tmp = _15;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
diff --git a/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc b/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
new file mode 100644
index 00000000..6264a8ef
--- /dev/null
+++ b/reference/opt/shaders-msl/tesc/water_tess.multi-patch.tesc
@@ -0,0 +1,91 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4 uScale;
+    float3 uCamPos;
+    float2 uPatchSize;
+    float2 uMaxTessLevel;
+    float uDistanceMod;
+    float4 uFrustum[6];
+};
+
+struct main0_patchOut
+{
+    float2 vOutPatchPosBase;
+    float4 vPatchLods;
+};
+
+struct main0_in
+{
+    float3 vPatchPosBase;
+    ushort2 m_996;
+};
+
+kernel void main0(constant UBO& _41 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    float2 _431 = (gl_in[0].vPatchPosBase.xy - float2(10.0)) * _41.uScale.xy;
+    float2 _441 = ((gl_in[0].vPatchPosBase.xy + _41.uPatchSize) + float2(10.0)) * _41.uScale.xy;
+    float3 _446 = float3(_431.x, -10.0, _431.y);
+    float3 _451 = float3(_441.x, 10.0, _441.y);
+    float4 _467 = float4((_446 + _451) * 0.5, 1.0);
+    float3 _514 = float3(length(_451 - _446) * (-0.5));
+    bool _516 = any(float3(dot(_41.uFrustum[0], _467), dot(_41.uFrustum[1], _467), dot(_41.uFrustum[2], _467)) <= _514);
+    bool _526;
+    if (!_516)
+    {
+        _526 = any(float3(dot(_41.uFrustum[3], _467), dot(_41.uFrustum[4], _467), dot(_41.uFrustum[5], _467)) <= _514);
+    }
+    else
+    {
+        _526 = _516;
+    }
+    if (!(!_526))
+    {
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(-1.0);
+    }
+    else
+    {
+        patchOut.vOutPatchPosBase = gl_in[0].vPatchPosBase.xy;
+        float2 _681 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _710 = (gl_in[0].vPatchPosBase.xy + (float2(0.5, -0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _729 = fast::clamp(log2((length(_41.uCamPos - float3(_710.x, 0.0, _710.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _739 = (gl_in[0].vPatchPosBase.xy + (float2(1.5, -0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _768 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5, 0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _787 = fast::clamp(log2((length(_41.uCamPos - float3(_768.x, 0.0, _768.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _797 = (gl_in[0].vPatchPosBase.xy + (float2(0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _816 = fast::clamp(log2((length(_41.uCamPos - float3(_797.x, 0.0, _797.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _826 = (gl_in[0].vPatchPosBase.xy + (float2(1.5, 0.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _845 = fast::clamp(log2((length(_41.uCamPos - float3(_826.x, 0.0, _826.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _855 = (gl_in[0].vPatchPosBase.xy + (float2(-0.5, 1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float2 _884 = (gl_in[0].vPatchPosBase.xy + (float2(0.5, 1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _903 = fast::clamp(log2((length(_41.uCamPos - float3(_884.x, 0.0, _884.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x);
+        float2 _913 = (gl_in[0].vPatchPosBase.xy + (float2(1.5) * _41.uPatchSize)) * _41.uScale.xy;
+        float _614 = dot(float4(_787, _816, fast::clamp(log2((length(_41.uCamPos - float3(_855.x, 0.0, _855.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _903), float4(0.25));
+        float _620 = dot(float4(fast::clamp(log2((length(_41.uCamPos - float3(_681.x, 0.0, _681.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _729, _787, _816), float4(0.25));
+        float _626 = dot(float4(_729, fast::clamp(log2((length(_41.uCamPos - float3(_739.x, 0.0, _739.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x), _816, _845), float4(0.25));
+        float _632 = dot(float4(_816, _845, _903, fast::clamp(log2((length(_41.uCamPos - float3(_913.x, 0.0, _913.y)) + 9.9999997473787516355514526367188e-05) * _41.uDistanceMod), 0.0, _41.uMaxTessLevel.x)), float4(0.25));
+        float4 _633 = float4(_614, _620, _626, _632);
+        patchOut.vPatchLods = _633;
+        float4 _940 = exp2(-fast::min(_633, _633.yzwx)) * _41.uMaxTessLevel.y;
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_940.x);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_940.y);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(_940.z);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(_940.w);
+        float _948 = _41.uMaxTessLevel.y * exp2(-fast::min(fast::min(fast::min(_614, _620), fast::min(_626, _632)), _816));
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(_948);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(_948);
+    }
+}
+
diff --git a/reference/opt/shaders-msl/vert/basic.for-tess.vert b/reference/opt/shaders-msl/vert/basic.for-tess.vert
new file mode 100644
index 00000000..c99a95ac
--- /dev/null
+++ b/reference/opt/shaders-msl/vert/basic.for-tess.vert
@@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _16.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
diff --git a/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert b/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
new file mode 100644
index 00000000..e3d2d1fa
--- /dev/null
+++ b/reference/opt/shaders-msl/vert/leaf-function.for-tess.vert
@@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _18.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
diff --git a/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert b/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
new file mode 100644
index 00000000..c40e0ec7
--- /dev/null
+++ b/reference/opt/shaders-msl/vert/no_stage_out.for-tess.vert
@@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _10
+{
+    uint4 _m0[1024];
+};
+
+struct main0_in
+{
+    uint4 m_19 [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], device _10& _12 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]])
+{
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_VertexIndex = gl_GlobalInvocationID.x + spvDispatchBase.x;
+    _12._m0[int(gl_VertexIndex)] = in.m_19;
+}
+
diff --git a/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc b/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
new file mode 100644
index 00000000..0d8a5001
--- /dev/null
+++ b/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@@ -0,0 +1,140 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexOutput
+{
+    float4 pos;
+    float2 uv;
+};
+
+struct HSOut
+{
+    float4 pos;
+    float2 uv;
+};
+
+struct HSConstantOut
+{
+    spvUnsafeArray<float, 3> EdgeTess;
+    float InsideTess;
+};
+
+struct VertexOutput_1
+{
+    float3 uv;
+};
+
+struct HSOut_1
+{
+    float2 uv;
+};
+
+struct main0_out
+{
+    HSOut_1 _entryPointOutput;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float3 VertexOutput_uv;
+    ushort2 m_172;
+    float4 gl_Position;
+};
+
+static inline __attribute__((always_inline))
+HSOut _hs_main(thread const spvUnsafeArray<VertexOutput, 3> (&p), thread const uint& i)
+{
+    HSOut _output;
+    _output.pos = p[i].pos;
+    _output.uv = p[i].uv;
+    return _output;
+}
+
+static inline __attribute__((always_inline))
+HSConstantOut PatchHS(thread const spvUnsafeArray<VertexOutput, 3> (&_patch))
+{
+    HSConstantOut _output;
+    _output.EdgeTess[0] = (float2(1.0) + _patch[0].uv).x;
+    _output.EdgeTess[1] = (float2(1.0) + _patch[0].uv).x;
+    _output.EdgeTess[2] = (float2(1.0) + _patch[0].uv).x;
+    _output.InsideTess = (float2(1.0) + _patch[0].uv).x;
+    return _output;
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 3];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 3;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1]);
+    spvUnsafeArray<VertexOutput, 3> p;
+    p[0].pos = gl_in[0].gl_Position;
+    p[0].uv = gl_in[0].VertexOutput_uv.xy;
+    p[1].pos = gl_in[1].gl_Position;
+    p[1].uv = gl_in[1].VertexOutput_uv.xy;
+    p[2].pos = gl_in[2].gl_Position;
+    p[2].uv = gl_in[2].VertexOutput_uv.xy;
+    uint i = gl_InvocationID;
+    spvUnsafeArray<VertexOutput, 3> param;
+    param = p;
+    uint param_1 = i;
+    HSOut flattenTemp = _hs_main(param, param_1);
+    gl_out[gl_InvocationID].gl_Position = flattenTemp.pos;
+    gl_out[gl_InvocationID]._entryPointOutput.uv = flattenTemp.uv;
+    threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup);
+    if (int(gl_InvocationID) == 0)
+    {
+        spvUnsafeArray<VertexOutput, 3> param_2;
+        param_2 = p;
+        HSConstantOut _patchConstantResult = PatchHS(param_2);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_patchConstantResult.EdgeTess[0]);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_patchConstantResult.EdgeTess[1]);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(_patchConstantResult.EdgeTess[2]);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(_patchConstantResult.InsideTess);
+    }
+}
+
diff --git a/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc b/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
new file mode 100644
index 00000000..a5f316dd
--- /dev/null
+++ b/reference/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TessLevels
+{
+    float inner0;
+    float inner1;
+    float outer0;
+    float outer1;
+    float outer2;
+    float outer3;
+};
+
+kernel void main0(const device TessLevels& sb_levels [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor = half(sb_levels.inner0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(sb_levels.outer0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(sb_levels.outer1);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(sb_levels.outer2);
+}
+
diff --git a/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc b/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
new file mode 100644
index 00000000..f4b1479b
--- /dev/null
+++ b/reference/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@@ -0,0 +1,47 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+struct main0_in
+{
+    uint3 m_78;
+    ushort2 m_82;
+    float4 gl_Position;
+};
+
+static inline __attribute__((always_inline))
+void set_position(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device main0_in* thread & gl_in)
+{
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 1];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 1;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+    set_position(gl_out, gl_InvocationID, gl_in);
+}
+
diff --git a/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc b/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
new file mode 100644
index 00000000..e47d56a2
--- /dev/null
+++ b/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@@ -0,0 +1,38 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Boo
+{
+    float3 a;
+    uint3 b;
+};
+
+struct main0_out
+{
+    Boo vVertex;
+};
+
+struct main0_in
+{
+    float3 Boo_a;
+    uint3 Boo_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
+    gl_out[gl_InvocationID].vVertex = _26;
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(1.0);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(2.0);
+}
+
diff --git a/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert b/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
new file mode 100644
index 00000000..b3c8b6bb
--- /dev/null
+++ b/reference/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_BaseVertex = spvDispatchBase.x;
+    uint gl_BaseInstance = spvDispatchBase.y;
+    out.gl_Position = float4(float(int(gl_BaseVertex)), float(int(gl_BaseInstance)), 0.0, 1.0);
+}
+
diff --git a/reference/shaders-msl/tesc/basic.multi-patch.tesc b/reference/shaders-msl/tesc/basic.multi-patch.tesc
new file mode 100644
index 00000000..b1403a3f
--- /dev/null
+++ b/reference/shaders-msl/tesc/basic.multi-patch.tesc
@@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_patchOut
+{
+    float3 vFoo;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(8.8999996185302734375);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(6.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.900000095367431640625);
+    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(4.900000095367431640625);
+    patchOut.vFoo = float3(1.0);
+}
+
diff --git a/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc b/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
new file mode 100644
index 00000000..4f4cf0b0
--- /dev/null
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@@ -0,0 +1,68 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4x4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 vInputs;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4x4, 32> _16 = spvUnsafeArray<float4x4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4x4, 32> tmp;
+    tmp = _16;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
diff --git a/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc b/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
new file mode 100644
index 00000000..ad23ea7f
--- /dev/null
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@@ -0,0 +1,80 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct VertexData
+{
+    float4x4 a;
+    spvUnsafeArray<float4, 2> b;
+    float4 c;
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4x4 VertexData_a;
+    spvUnsafeArray<float4, 2> VertexData_b;
+    float4 VertexData_c;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> tmp;
+    tmp = _19;
+    int _27 = gl_InvocationID ^ 1;
+    VertexData _30 = VertexData{ gl_in[_27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[_27].VertexData_b[0], gl_in[_27].VertexData_b[1] }), gl_in[_27].VertexData_c };
+    VertexData tmp_single = _30;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + tmp_single.c;
+}
+
diff --git a/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc b/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
new file mode 100644
index 00000000..416bc681
--- /dev/null
+++ b/reference/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@@ -0,0 +1,69 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 vOutputs;
+};
+
+struct main0_in
+{
+    float4 vInputs;
+    ushort2 m_43;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    spvUnsafeArray<float4, 32> _15 = spvUnsafeArray<float4, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
+    spvUnsafeArray<float4, 32> tmp;
+    tmp = _15;
+    gl_out[gl_InvocationID].vOutputs = tmp[gl_InvocationID];
+}
+
diff --git a/reference/shaders-msl/tesc/water_tess.multi-patch.tesc b/reference/shaders-msl/tesc/water_tess.multi-patch.tesc
new file mode 100644
index 00000000..afc011fe
--- /dev/null
+++ b/reference/shaders-msl/tesc/water_tess.multi-patch.tesc
@@ -0,0 +1,135 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4 uScale;
+    float3 uCamPos;
+    float2 uPatchSize;
+    float2 uMaxTessLevel;
+    float uDistanceMod;
+    float4 uFrustum[6];
+};
+
+struct main0_patchOut
+{
+    float2 vOutPatchPosBase;
+    float4 vPatchLods;
+};
+
+struct main0_in
+{
+    float3 vPatchPosBase;
+    ushort2 m_430;
+};
+
+static inline __attribute__((always_inline))
+bool frustum_cull(thread const float2& p0, constant UBO& v_41)
+{
+    float2 min_xz = (p0 - float2(10.0)) * v_41.uScale.xy;
+    float2 max_xz = ((p0 + v_41.uPatchSize) + float2(10.0)) * v_41.uScale.xy;
+    float3 bb_min = float3(min_xz.x, -10.0, min_xz.y);
+    float3 bb_max = float3(max_xz.x, 10.0, max_xz.y);
+    float3 center = (bb_min + bb_max) * 0.5;
+    float radius = 0.5 * length(bb_max - bb_min);
+    float3 f0 = float3(dot(v_41.uFrustum[0], float4(center, 1.0)), dot(v_41.uFrustum[1], float4(center, 1.0)), dot(v_41.uFrustum[2], float4(center, 1.0)));
+    float3 f1 = float3(dot(v_41.uFrustum[3], float4(center, 1.0)), dot(v_41.uFrustum[4], float4(center, 1.0)), dot(v_41.uFrustum[5], float4(center, 1.0)));
+    bool _205 = any(f0 <= float3(-radius));
+    bool _215;
+    if (!_205)
+    {
+        _215 = any(f1 <= float3(-radius));
+    }
+    else
+    {
+        _215 = _205;
+    }
+    return !_215;
+}
+
+static inline __attribute__((always_inline))
+float lod_factor(thread const float2& pos_, constant UBO& v_41)
+{
+    float2 pos = pos_ * v_41.uScale.xy;
+    float3 dist_to_cam = v_41.uCamPos - float3(pos.x, 0.0, pos.y);
+    float level = log2((length(dist_to_cam) + 9.9999997473787516355514526367188e-05) * v_41.uDistanceMod);
+    return fast::clamp(level, 0.0, v_41.uMaxTessLevel.x);
+}
+
+static inline __attribute__((always_inline))
+float4 tess_level(thread const float4& lod, constant UBO& v_41)
+{
+    return exp2(-lod) * v_41.uMaxTessLevel.y;
+}
+
+static inline __attribute__((always_inline))
+float tess_level(thread const float& lod, constant UBO& v_41)
+{
+    return v_41.uMaxTessLevel.y * exp2(-lod);
+}
+
+static inline __attribute__((always_inline))
+void compute_tess_levels(thread const float2& p0, constant UBO& v_41, device float2& vOutPatchPosBase, device float4& vPatchLods, device half (&gl_TessLevelOuter)[4], device half (&gl_TessLevelInner)[2])
+{
+    vOutPatchPosBase = p0;
+    float2 param = p0 + (float2(-0.5) * v_41.uPatchSize);
+    float l00 = lod_factor(param, v_41);
+    float2 param_1 = p0 + (float2(0.5, -0.5) * v_41.uPatchSize);
+    float l10 = lod_factor(param_1, v_41);
+    float2 param_2 = p0 + (float2(1.5, -0.5) * v_41.uPatchSize);
+    float l20 = lod_factor(param_2, v_41);
+    float2 param_3 = p0 + (float2(-0.5, 0.5) * v_41.uPatchSize);
+    float l01 = lod_factor(param_3, v_41);
+    float2 param_4 = p0 + (float2(0.5) * v_41.uPatchSize);
+    float l11 = lod_factor(param_4, v_41);
+    float2 param_5 = p0 + (float2(1.5, 0.5) * v_41.uPatchSize);
+    float l21 = lod_factor(param_5, v_41);
+    float2 param_6 = p0 + (float2(-0.5, 1.5) * v_41.uPatchSize);
+    float l02 = lod_factor(param_6, v_41);
+    float2 param_7 = p0 + (float2(0.5, 1.5) * v_41.uPatchSize);
+    float l12 = lod_factor(param_7, v_41);
+    float2 param_8 = p0 + (float2(1.5) * v_41.uPatchSize);
+    float l22 = lod_factor(param_8, v_41);
+    float4 lods = float4(dot(float4(l01, l11, l02, l12), float4(0.25)), dot(float4(l00, l10, l01, l11), float4(0.25)), dot(float4(l10, l20, l11, l21), float4(0.25)), dot(float4(l11, l21, l12, l22), float4(0.25)));
+    vPatchLods = lods;
+    float4 outer_lods = fast::min(lods, lods.yzwx);
+    float4 param_9 = outer_lods;
+    float4 levels = tess_level(param_9, v_41);
+    gl_TessLevelOuter[0] = half(levels.x);
+    gl_TessLevelOuter[1] = half(levels.y);
+    gl_TessLevelOuter[2] = half(levels.z);
+    gl_TessLevelOuter[3] = half(levels.w);
+    float min_lod = fast::min(fast::min(lods.x, lods.y), fast::min(lods.z, lods.w));
+    float param_10 = fast::min(min_lod, l11);
+    float inner = tess_level(param_10, v_41);
+    gl_TessLevelInner[0] = half(inner);
+    gl_TessLevelInner[1] = half(inner);
+}
+
+kernel void main0(constant UBO& v_41 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
+{
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 1];
+    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 1, spvIndirectParams[1]);
+    float2 p0 = gl_in[0].vPatchPosBase.xy;
+    float2 param = p0;
+    if (!frustum_cull(param, v_41))
+    {
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[3] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[0] = half(-1.0);
+        spvTessLevel[gl_PrimitiveID].insideTessellationFactor[1] = half(-1.0);
+    }
+    else
+    {
+        float2 param_1 = p0;
+        compute_tess_levels(param_1, v_41, patchOut.vOutPatchPosBase, patchOut.vPatchLods, spvTessLevel[gl_PrimitiveID].edgeTessellationFactor, spvTessLevel[gl_PrimitiveID].insideTessellationFactor);
+    }
+}
+
diff --git a/reference/shaders-msl/vert/basic.for-tess.vert b/reference/shaders-msl/vert/basic.for-tess.vert
new file mode 100644
index 00000000..c99a95ac
--- /dev/null
+++ b/reference/shaders-msl/vert/basic.for-tess.vert
@@ -0,0 +1,31 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    out.gl_Position = _16.uMVP * in.aVertex;
+    out.vNormal = in.aNormal;
+}
+
diff --git a/reference/shaders-msl/vert/leaf-function.for-tess.vert b/reference/shaders-msl/vert/leaf-function.for-tess.vert
new file mode 100644
index 00000000..5a960e5e
--- /dev/null
+++ b/reference/shaders-msl/vert/leaf-function.for-tess.vert
@@ -0,0 +1,39 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVP;
+};
+
+struct main0_out
+{
+    float3 vNormal;
+    float4 gl_Position;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+    float3 aNormal [[attribute(1)]];
+};
+
+static inline __attribute__((always_inline))
+void set_output(device float4& gl_Position, constant UBO& v_18, thread float4& aVertex, device float3& vNormal, thread float3& aNormal)
+{
+    gl_Position = v_18.uMVP * aVertex;
+    vNormal = aNormal;
+}
+
+kernel void main0(main0_in in [[stage_in]], constant UBO& v_18 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
+{
+    device main0_out& out = spvOut[gl_GlobalInvocationID.y * spvStageInputSize.x + gl_GlobalInvocationID.x];
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    set_output(out.gl_Position, v_18, in.aVertex, out.vNormal, in.aNormal);
+}
+
diff --git a/reference/shaders-msl/vert/no_stage_out.for-tess.vert b/reference/shaders-msl/vert/no_stage_out.for-tess.vert
new file mode 100644
index 00000000..c40e0ec7
--- /dev/null
+++ b/reference/shaders-msl/vert/no_stage_out.for-tess.vert
@@ -0,0 +1,23 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _10
+{
+    uint4 _m0[1024];
+};
+
+struct main0_in
+{
+    uint4 m_19 [[attribute(0)]];
+};
+
+kernel void main0(main0_in in [[stage_in]], device _10& _12 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], uint3 spvDispatchBase [[grid_origin]])
+{
+    if (any(gl_GlobalInvocationID >= spvStageInputSize))
+        return;
+    uint gl_VertexIndex = gl_GlobalInvocationID.x + spvDispatchBase.x;
+    _12._m0[int(gl_VertexIndex)] = in.m_19;
+}
+
diff --git a/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc b/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
new file mode 100644
index 00000000..0fd4dce2
--- /dev/null
+++ b/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@@ -0,0 +1,248 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 2
+; Bound: 162
+; Schema: 0
+               OpCapability Tessellation
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %hs_main "main" %p_pos %p_1 %i_1 %_entryPointOutput_pos %_entryPointOutput %_patchConstantOutput_EdgeTess %_patchConstantOutput_InsideTess
+               OpExecutionMode %hs_main OutputVertices 3
+               OpExecutionMode %hs_main Triangles
+               OpExecutionMode %hs_main SpacingFractionalOdd
+               OpExecutionMode %hs_main VertexOrderCw
+               OpSource HLSL 500
+               OpName %hs_main "hs_main"
+               OpName %VertexOutput "VertexOutput"
+               OpMemberName %VertexOutput 0 "pos"
+               OpMemberName %VertexOutput 1 "uv"
+               OpName %HSOut "HSOut"
+               OpMemberName %HSOut 0 "pos"
+               OpMemberName %HSOut 1 "uv"
+               OpName %_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ "@hs_main(struct-VertexOutput-vf4-vf21[3];u1;"
+               OpName %p "p"
+               OpName %i "i"
+               OpName %HSConstantOut "HSConstantOut"
+               OpMemberName %HSConstantOut 0 "EdgeTess"
+               OpMemberName %HSConstantOut 1 "InsideTess"
+               OpName %PatchHS_struct_VertexOutput_vf4_vf21_3__ "PatchHS(struct-VertexOutput-vf4-vf21[3];"
+               OpName %patch "patch"
+               OpName %output "output"
+               OpName %p_0 "p"
+               OpName %p_pos "p.pos"
+               OpName %VertexOutput_0 "VertexOutput"
+               OpMemberName %VertexOutput_0 0 "uv"
+               OpName %p_1 "p"
+               OpName %i_0 "i"
+               OpName %i_1 "i"
+               OpName %flattenTemp "flattenTemp"
+               OpName %param "param"
+               OpName %param_0 "param"
+               OpName %_entryPointOutput_pos "@entryPointOutput.pos"
+               OpName %HSOut_0 "HSOut"
+               OpMemberName %HSOut_0 0 "uv"
+               OpName %_entryPointOutput "@entryPointOutput"
+               OpName %_patchConstantResult "@patchConstantResult"
+               OpName %param_1 "param"
+               OpName %_patchConstantOutput_EdgeTess "@patchConstantOutput.EdgeTess"
+               OpName %_patchConstantOutput_InsideTess "@patchConstantOutput.InsideTess"
+               OpName %output_0 "output"
+               OpDecorate %p_pos BuiltIn Position
+               OpDecorate %p_1 Location 0
+               OpDecorate %i_1 BuiltIn InvocationId
+               OpDecorate %_entryPointOutput_pos BuiltIn Position
+               OpDecorate %_entryPointOutput Location 0
+               OpDecorate %_patchConstantOutput_EdgeTess Patch
+               OpDecorate %_patchConstantOutput_EdgeTess BuiltIn TessLevelOuter
+               OpDecorate %_patchConstantOutput_InsideTess Patch
+               OpDecorate %_patchConstantOutput_InsideTess BuiltIn TessLevelInner
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+    %v2float = OpTypeVector %float 2
+%VertexOutput = OpTypeStruct %v4float %v2float
+       %uint = OpTypeInt 32 0
+     %uint_3 = OpConstant %uint 3
+%_arr_VertexOutput_uint_3 = OpTypeArray %VertexOutput %uint_3
+%_ptr_Function__arr_VertexOutput_uint_3 = OpTypePointer Function %_arr_VertexOutput_uint_3
+%_ptr_Function_uint = OpTypePointer Function %uint
+      %HSOut = OpTypeStruct %v4float %v2float
+         %16 = OpTypeFunction %HSOut %_ptr_Function__arr_VertexOutput_uint_3 %_ptr_Function_uint
+%_arr_float_uint_3 = OpTypeArray %float %uint_3
+%HSConstantOut = OpTypeStruct %_arr_float_uint_3 %float
+         %23 = OpTypeFunction %HSConstantOut %_ptr_Function__arr_VertexOutput_uint_3
+%_ptr_Function_HSOut = OpTypePointer Function %HSOut
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Function_v4float = OpTypePointer Function %v4float
+      %int_1 = OpConstant %int 1
+%_ptr_Function_v2float = OpTypePointer Function %v2float
+%_arr_v4float_uint_3 = OpTypeArray %v4float %uint_3
+%_ptr_Input__arr_v4float_uint_3 = OpTypePointer Input %_arr_v4float_uint_3
+      %p_pos = OpVariable %_ptr_Input__arr_v4float_uint_3 Input
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+%VertexOutput_0 = OpTypeStruct %v2float
+%_arr_VertexOutput_0_uint_3 = OpTypeArray %VertexOutput_0 %uint_3
+%_ptr_Input__arr_VertexOutput_0_uint_3 = OpTypePointer Input %_arr_VertexOutput_0_uint_3
+        %p_1 = OpVariable %_ptr_Input__arr_VertexOutput_0_uint_3 Input
+%_ptr_Input_v2float = OpTypePointer Input %v2float
+      %int_2 = OpConstant %int 2
+%_ptr_Input_uint = OpTypePointer Input %uint
+        %i_1 = OpVariable %_ptr_Input_uint Input
+%_ptr_Output__arr_v4float_uint_3 = OpTypePointer Output %_arr_v4float_uint_3
+%_entryPointOutput_pos = OpVariable %_ptr_Output__arr_v4float_uint_3 Output
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+    %HSOut_0 = OpTypeStruct %v2float
+%_arr_HSOut_0_uint_3 = OpTypeArray %HSOut_0 %uint_3
+%_ptr_Output__arr_HSOut_0_uint_3 = OpTypePointer Output %_arr_HSOut_0_uint_3
+%_entryPointOutput = OpVariable %_ptr_Output__arr_HSOut_0_uint_3 Output
+%_ptr_Output_v2float = OpTypePointer Output %v2float
+     %uint_2 = OpConstant %uint 2
+     %uint_1 = OpConstant %uint 1
+     %uint_0 = OpConstant %uint 0
+       %bool = OpTypeBool
+%_ptr_Function_HSConstantOut = OpTypePointer Function %HSConstantOut
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%_patchConstantOutput_EdgeTess = OpVariable %_ptr_Output__arr_float_uint_4 Output
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_Output_float = OpTypePointer Output %float
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%_patchConstantOutput_InsideTess = OpVariable %_ptr_Output__arr_float_uint_2 Output
+    %float_1 = OpConstant %float 1
+    %hs_main = OpFunction %void None %3
+          %5 = OpLabel
+        %p_0 = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+        %i_0 = OpVariable %_ptr_Function_uint Function
+%flattenTemp = OpVariable %_ptr_Function_HSOut Function
+      %param = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+    %param_0 = OpVariable %_ptr_Function_uint Function
+%_patchConstantResult = OpVariable %_ptr_Function_HSConstantOut Function
+    %param_1 = OpVariable %_ptr_Function__arr_VertexOutput_uint_3 Function
+         %50 = OpAccessChain %_ptr_Input_v4float %p_pos %int_0
+         %51 = OpLoad %v4float %50
+         %52 = OpAccessChain %_ptr_Function_v4float %p_0 %int_0 %int_0
+               OpStore %52 %51
+         %58 = OpAccessChain %_ptr_Input_v2float %p_1 %int_0 %int_0
+         %59 = OpLoad %v2float %58
+         %60 = OpAccessChain %_ptr_Function_v2float %p_0 %int_0 %int_1
+               OpStore %60 %59
+         %61 = OpAccessChain %_ptr_Input_v4float %p_pos %int_1
+         %62 = OpLoad %v4float %61
+         %63 = OpAccessChain %_ptr_Function_v4float %p_0 %int_1 %int_0
+               OpStore %63 %62
+         %64 = OpAccessChain %_ptr_Input_v2float %p_1 %int_1 %int_0
+         %65 = OpLoad %v2float %64
+         %66 = OpAccessChain %_ptr_Function_v2float %p_0 %int_1 %int_1
+               OpStore %66 %65
+         %68 = OpAccessChain %_ptr_Input_v4float %p_pos %int_2
+         %69 = OpLoad %v4float %68
+         %70 = OpAccessChain %_ptr_Function_v4float %p_0 %int_2 %int_0
+               OpStore %70 %69
+         %71 = OpAccessChain %_ptr_Input_v2float %p_1 %int_2 %int_0
+         %72 = OpLoad %v2float %71
+         %73 = OpAccessChain %_ptr_Function_v2float %p_0 %int_2 %int_1
+               OpStore %73 %72
+         %77 = OpLoad %uint %i_1
+               OpStore %i_0 %77
+         %80 = OpLoad %_arr_VertexOutput_uint_3 %p_0
+               OpStore %param %80
+         %82 = OpLoad %uint %i_0
+               OpStore %param_0 %82
+         %83 = OpFunctionCall %HSOut %_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ %param %param_0
+               OpStore %flattenTemp %83
+         %86 = OpAccessChain %_ptr_Function_v4float %flattenTemp %int_0
+         %87 = OpLoad %v4float %86
+         %94 = OpLoad %uint %i_1
+         %89 = OpAccessChain %_ptr_Output_v4float %_entryPointOutput_pos %94
+               OpStore %89 %87
+         %95 = OpAccessChain %_ptr_Function_v2float %flattenTemp %int_1
+         %96 = OpLoad %v2float %95
+         %98 = OpAccessChain %_ptr_Output_v2float %_entryPointOutput %94 %int_0
+               OpStore %98 %96
+               OpControlBarrier %uint_2 %uint_1 %uint_0
+        %102 = OpLoad %uint %i_1
+        %104 = OpIEqual %bool %102 %int_0
+               OpSelectionMerge %106 None
+               OpBranchConditional %104 %105 %106
+        %105 = OpLabel
+        %110 = OpLoad %_arr_VertexOutput_uint_3 %p_0
+               OpStore %param_1 %110
+        %111 = OpFunctionCall %HSConstantOut %PatchHS_struct_VertexOutput_vf4_vf21_3__ %param_1
+               OpStore %_patchConstantResult %111
+        %117 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_0
+        %118 = OpLoad %float %117
+        %120 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_0
+               OpStore %120 %118
+        %121 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_1
+        %122 = OpLoad %float %121
+        %123 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_1
+               OpStore %123 %122
+        %124 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_0 %int_2
+        %125 = OpLoad %float %124
+        %126 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_EdgeTess %int_2
+               OpStore %126 %125
+        %130 = OpAccessChain %_ptr_Function_float %_patchConstantResult %int_1
+        %131 = OpLoad %float %130
+        %132 = OpAccessChain %_ptr_Output_float %_patchConstantOutput_InsideTess %int_0
+               OpStore %132 %131
+               OpBranch %106
+        %106 = OpLabel
+               OpReturn
+               OpFunctionEnd
+%_hs_main_struct_VertexOutput_vf4_vf21_3__u1_ = OpFunction %HSOut None %16
+          %p = OpFunctionParameter %_ptr_Function__arr_VertexOutput_uint_3
+          %i = OpFunctionParameter %_ptr_Function_uint
+         %20 = OpLabel
+     %output = OpVariable %_ptr_Function_HSOut Function
+         %31 = OpLoad %uint %i
+         %33 = OpAccessChain %_ptr_Function_v4float %p %31 %int_0
+         %34 = OpLoad %v4float %33
+         %35 = OpAccessChain %_ptr_Function_v4float %output %int_0
+               OpStore %35 %34
+         %37 = OpLoad %uint %i
+         %39 = OpAccessChain %_ptr_Function_v2float %p %37 %int_1
+         %40 = OpLoad %v2float %39
+         %41 = OpAccessChain %_ptr_Function_v2float %output %int_1
+               OpStore %41 %40
+         %42 = OpLoad %HSOut %output
+               OpReturnValue %42
+               OpFunctionEnd
+%PatchHS_struct_VertexOutput_vf4_vf21_3__ = OpFunction %HSConstantOut None %23
+      %patch = OpFunctionParameter %_ptr_Function__arr_VertexOutput_uint_3
+         %26 = OpLabel
+   %output_0 = OpVariable %_ptr_Function_HSConstantOut Function
+        %135 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %136 = OpLoad %v2float %135
+        %137 = OpCompositeConstruct %v2float %float_1 %float_1
+        %138 = OpFAdd %v2float %137 %136
+        %139 = OpCompositeExtract %float %138 0
+        %140 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_0
+               OpStore %140 %139
+        %141 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %142 = OpLoad %v2float %141
+        %143 = OpCompositeConstruct %v2float %float_1 %float_1
+        %144 = OpFAdd %v2float %143 %142
+        %145 = OpCompositeExtract %float %144 0
+        %146 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_1
+               OpStore %146 %145
+        %147 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %148 = OpLoad %v2float %147
+        %149 = OpCompositeConstruct %v2float %float_1 %float_1
+        %150 = OpFAdd %v2float %149 %148
+        %151 = OpCompositeExtract %float %150 0
+        %152 = OpAccessChain %_ptr_Function_float %output_0 %int_0 %int_2
+               OpStore %152 %151
+        %153 = OpAccessChain %_ptr_Function_v2float %patch %int_0 %int_1
+        %154 = OpLoad %v2float %153
+        %155 = OpCompositeConstruct %v2float %float_1 %float_1
+        %156 = OpFAdd %v2float %155 %154
+        %157 = OpCompositeExtract %float %156 0
+        %158 = OpAccessChain %_ptr_Function_float %output_0 %int_1
+               OpStore %158 %157
+        %159 = OpLoad %HSConstantOut %output_0
+               OpReturnValue %159
+               OpFunctionEnd
diff --git a/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc b/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
new file mode 100644
index 00000000..b21a2d3d
--- /dev/null
+++ b/shaders-msl/asm/tesc/tess-level-overrun.multi-patch.asm.tesc
@@ -0,0 +1,102 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 46
+; Schema: 0
+               OpCapability Tessellation
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint TessellationControl %main "main" %gl_TessLevelInner %gl_TessLevelOuter
+               OpExecutionMode %main OutputVertices 1
+               OpExecutionMode %main Triangles
+               OpSource ESSL 310
+               OpSourceExtension "GL_EXT_shader_io_blocks"
+               OpSourceExtension "GL_EXT_tessellation_shader"
+               OpName %main "main"
+               OpName %gl_TessLevelInner "gl_TessLevelInner"
+               OpName %TessLevels "TessLevels"
+               OpMemberName %TessLevels 0 "inner0"
+               OpMemberName %TessLevels 1 "inner1"
+               OpMemberName %TessLevels 2 "outer0"
+               OpMemberName %TessLevels 3 "outer1"
+               OpMemberName %TessLevels 4 "outer2"
+               OpMemberName %TessLevels 5 "outer3"
+               OpName %sb_levels "sb_levels"
+               OpName %gl_TessLevelOuter "gl_TessLevelOuter"
+               OpDecorate %gl_TessLevelInner Patch
+               OpDecorate %gl_TessLevelInner BuiltIn TessLevelInner
+               OpMemberDecorate %TessLevels 0 Restrict
+               OpMemberDecorate %TessLevels 0 NonWritable
+               OpMemberDecorate %TessLevels 0 Offset 0
+               OpMemberDecorate %TessLevels 1 Restrict
+               OpMemberDecorate %TessLevels 1 NonWritable
+               OpMemberDecorate %TessLevels 1 Offset 4
+               OpMemberDecorate %TessLevels 2 Restrict
+               OpMemberDecorate %TessLevels 2 NonWritable
+               OpMemberDecorate %TessLevels 2 Offset 8
+               OpMemberDecorate %TessLevels 3 Restrict
+               OpMemberDecorate %TessLevels 3 NonWritable
+               OpMemberDecorate %TessLevels 3 Offset 12
+               OpMemberDecorate %TessLevels 4 Restrict
+               OpMemberDecorate %TessLevels 4 NonWritable
+               OpMemberDecorate %TessLevels 4 Offset 16
+               OpMemberDecorate %TessLevels 5 Restrict
+               OpMemberDecorate %TessLevels 5 NonWritable
+               OpMemberDecorate %TessLevels 5 Offset 20
+               OpDecorate %TessLevels Block
+               OpDecorate %sb_levels DescriptorSet 0
+               OpDecorate %sb_levels Binding 0
+               OpDecorate %gl_TessLevelOuter Patch
+               OpDecorate %gl_TessLevelOuter BuiltIn TessLevelOuter
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+     %uint_2 = OpConstant %uint 2
+%_arr_float_uint_2 = OpTypeArray %float %uint_2
+%_ptr_Output__arr_float_uint_2 = OpTypePointer Output %_arr_float_uint_2
+%gl_TessLevelInner = OpVariable %_ptr_Output__arr_float_uint_2 Output
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+ %TessLevels = OpTypeStruct %float %float %float %float %float %float
+%_ptr_StorageBuffer_TessLevels = OpTypePointer StorageBuffer %TessLevels
+  %sb_levels = OpVariable %_ptr_StorageBuffer_TessLevels StorageBuffer
+%_ptr_StorageBuffer_float = OpTypePointer StorageBuffer %float
+%_ptr_Output_float = OpTypePointer Output %float
+      %int_1 = OpConstant %int 1
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Output__arr_float_uint_4 = OpTypePointer Output %_arr_float_uint_4
+%gl_TessLevelOuter = OpVariable %_ptr_Output__arr_float_uint_4 Output
+      %int_2 = OpConstant %int 2
+      %int_3 = OpConstant %int 3
+      %int_4 = OpConstant %int 4
+      %int_5 = OpConstant %int 5
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %18 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_0
+         %19 = OpLoad %float %18
+         %21 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_0
+               OpStore %21 %19
+         %23 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_1
+         %24 = OpLoad %float %23
+         %25 = OpAccessChain %_ptr_Output_float %gl_TessLevelInner %int_1
+               OpStore %25 %24
+         %31 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_2
+         %32 = OpLoad %float %31
+         %33 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_0
+               OpStore %33 %32
+         %35 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_3
+         %36 = OpLoad %float %35
+         %37 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_1
+               OpStore %37 %36
+         %39 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_4
+         %40 = OpLoad %float %39
+         %41 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_2
+               OpStore %41 %40
+         %43 = OpAccessChain %_ptr_StorageBuffer_float %sb_levels %int_5
+         %44 = OpLoad %float %43
+         %45 = OpAccessChain %_ptr_Output_float %gl_TessLevelOuter %int_3
+               OpStore %45 %44
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc b/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
new file mode 100644
index 00000000..a258afb3
--- /dev/null
+++ b/shaders-msl/desktop-only/tesc/basic.desktop.sso.multi-patch.tesc
@@ -0,0 +1,32 @@
+#version 450
+layout(vertices = 1) out;
+
+in gl_PerVertex
+{
+   vec4 gl_Position;
+} gl_in[gl_MaxPatchVertices];
+
+out gl_PerVertex
+{
+   vec4 gl_Position;
+} gl_out[1];
+
+layout(location = 0) patch out vec3 vFoo;
+
+void set_position()
+{
+    gl_out[gl_InvocationID].gl_Position = gl_in[0].gl_Position + gl_in[1].gl_Position;
+}
+
+void main()
+{
+    gl_TessLevelInner[0] = 8.9;
+    gl_TessLevelInner[1] = 6.9;
+    gl_TessLevelOuter[0] = 8.9;
+    gl_TessLevelOuter[1] = 6.9;
+    gl_TessLevelOuter[2] = 3.9;
+    gl_TessLevelOuter[3] = 4.9;
+    vFoo = vec3(1.0);
+
+    set_position();
+}
diff --git a/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc b/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
new file mode 100644
index 00000000..78d0d00c
--- /dev/null
+++ b/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@@ -0,0 +1,22 @@
+#version 450
+
+struct Boo
+{
+        vec3 a;
+        uvec3 b;
+};
+
+layout(vertices = 4) out;
+layout(location = 0) out Boo vVertex[];
+layout(location = 0) in Boo vInput[];
+
+void main()
+{
+        vVertex[gl_InvocationID] = vInput[gl_InvocationID];
+        gl_TessLevelOuter[0] = 1.0;
+        gl_TessLevelOuter[1] = 2.0;
+        gl_TessLevelOuter[2] = 3.0;
+        gl_TessLevelOuter[3] = 4.0;
+        gl_TessLevelInner[0] = 1.0;
+        gl_TessLevelInner[1] = 2.0;
+}
diff --git a/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert b/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
new file mode 100644
index 00000000..fadd1e73
--- /dev/null
+++ b/shaders-msl/desktop-only/vert/shader-draw-parameters.desktop.for-tess.vert
@@ -0,0 +1,11 @@
+#version 460
+
+out gl_PerVertex
+{
+    vec4 gl_Position;
+};
+
+void main()
+{
+    gl_Position = vec4(gl_BaseVertex, gl_BaseInstance, 0, 1);
+}
diff --git a/shaders-msl/tesc/basic.multi-patch.tesc b/shaders-msl/tesc/basic.multi-patch.tesc
new file mode 100644
index 00000000..0a41f98c
--- /dev/null
+++ b/shaders-msl/tesc/basic.multi-patch.tesc
@@ -0,0 +1,17 @@
+#version 310 es
+#extension GL_EXT_tessellation_shader : require
+
+layout(location = 0) patch out vec3 vFoo;
+
+layout(vertices = 1) out;
+
+void main()
+{
+    gl_TessLevelInner[0] = 8.9;
+    gl_TessLevelInner[1] = 6.9;
+    gl_TessLevelOuter[0] = 8.9;
+    gl_TessLevelOuter[1] = 6.9;
+    gl_TessLevelOuter[2] = 3.9;
+    gl_TessLevelOuter[3] = 4.9;
+    vFoo = vec3(1.0);
+}
diff --git a/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc b/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
new file mode 100644
index 00000000..36b16681
--- /dev/null
+++ b/shaders-msl/tesc/load-control-point-array-of-matrix.multi-patch.tesc
@@ -0,0 +1,12 @@
+#version 450
+
+layout(vertices = 4) out;
+
+layout(location = 0) in mat4 vInputs[gl_MaxPatchVertices];
+layout(location = 0) out mat4 vOutputs[4];
+
+void main()
+{
+	mat4 tmp[gl_MaxPatchVertices] = vInputs;
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID];
+}
diff --git a/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc b/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
new file mode 100644
index 00000000..4b4d5bfc
--- /dev/null
+++ b/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@@ -0,0 +1,21 @@
+#version 450
+
+layout(vertices = 4) out;
+
+struct VertexData
+{
+	mat4 a;
+	vec4 b[2];
+	vec4 c;
+};
+
+layout(location = 0) in VertexData vInputs[gl_MaxPatchVertices];
+layout(location = 0) out vec4 vOutputs[4];
+
+void main()
+{
+	VertexData tmp[gl_MaxPatchVertices] = vInputs;
+	VertexData tmp_single = vInputs[gl_InvocationID ^ 1];
+
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1] + tmp[gl_InvocationID].c + tmp_single.c;
+}
diff --git a/shaders-msl/tesc/load-control-point-array.multi-patch.tesc b/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
new file mode 100644
index 00000000..1a5924b8
--- /dev/null
+++ b/shaders-msl/tesc/load-control-point-array.multi-patch.tesc
@@ -0,0 +1,12 @@
+#version 450
+
+layout(vertices = 4) out;
+
+layout(location = 0) in vec4 vInputs[gl_MaxPatchVertices];
+layout(location = 0) out vec4 vOutputs[4];
+
+void main()
+{
+	vec4 tmp[gl_MaxPatchVertices] = vInputs;
+	vOutputs[gl_InvocationID] = tmp[gl_InvocationID];
+}
diff --git a/shaders-msl/tesc/water_tess.multi-patch.tesc b/shaders-msl/tesc/water_tess.multi-patch.tesc
new file mode 100644
index 00000000..3ecdc3d1
--- /dev/null
+++ b/shaders-msl/tesc/water_tess.multi-patch.tesc
@@ -0,0 +1,115 @@
+#version 310 es
+#extension GL_EXT_tessellation_shader : require
+
+layout(vertices = 1) out;
+layout(location = 0) in vec2 vPatchPosBase[];
+
+layout(std140) uniform UBO
+{
+    vec4 uScale;
+    highp vec3 uCamPos;
+    vec2 uPatchSize;
+    vec2 uMaxTessLevel;
+    float uDistanceMod;
+    vec4 uFrustum[6];
+};
+
+layout(location = 1) patch out vec2 vOutPatchPosBase;
+layout(location = 2) patch out vec4 vPatchLods;
+
+float lod_factor(vec2 pos_)
+{
+    vec2 pos = pos_ * uScale.xy;
+    vec3 dist_to_cam = uCamPos - vec3(pos.x, 0.0, pos.y);
+    float level = log2((length(dist_to_cam) + 0.0001) * uDistanceMod);
+    return clamp(level, 0.0, uMaxTessLevel.x);
+}
+
+float tess_level(float lod)
+{
+    return uMaxTessLevel.y * exp2(-lod);
+}
+
+vec4 tess_level(vec4 lod)
+{
+    return uMaxTessLevel.y * exp2(-lod);
+}
+
+// Guard band for vertex displacement.
+#define GUARD_BAND 10.0
+bool frustum_cull(vec2 p0)
+{
+    vec2 min_xz = (p0 - GUARD_BAND) * uScale.xy;
+    vec2 max_xz = (p0 + uPatchSize + GUARD_BAND) * uScale.xy;
+
+    vec3 bb_min = vec3(min_xz.x, -GUARD_BAND, min_xz.y);
+    vec3 bb_max = vec3(max_xz.x, +GUARD_BAND, max_xz.y);
+    vec3 center = 0.5 * (bb_min + bb_max);
+    float radius = 0.5 * length(bb_max - bb_min);
+
+    vec3 f0 = vec3(
+        dot(uFrustum[0], vec4(center, 1.0)),
+        dot(uFrustum[1], vec4(center, 1.0)),
+        dot(uFrustum[2], vec4(center, 1.0)));
+
+    vec3 f1 = vec3(
+        dot(uFrustum[3], vec4(center, 1.0)),
+        dot(uFrustum[4], vec4(center, 1.0)),
+        dot(uFrustum[5], vec4(center, 1.0)));
+
+    return !(any(lessThanEqual(f0, vec3(-radius))) || any(lessThanEqual(f1, vec3(-radius))));
+}
+
+void compute_tess_levels(vec2 p0)
+{
+    vOutPatchPosBase = p0;
+
+    float l00 = lod_factor(p0 + vec2(-0.5, -0.5) * uPatchSize);
+    float l10 = lod_factor(p0 + vec2(+0.5, -0.5) * uPatchSize);
+    float l20 = lod_factor(p0 + vec2(+1.5, -0.5) * uPatchSize);
+    float l01 = lod_factor(p0 + vec2(-0.5, +0.5) * uPatchSize);
+    float l11 = lod_factor(p0 + vec2(+0.5, +0.5) * uPatchSize);
+    float l21 = lod_factor(p0 + vec2(+1.5, +0.5) * uPatchSize);
+    float l02 = lod_factor(p0 + vec2(-0.5, +1.5) * uPatchSize);
+    float l12 = lod_factor(p0 + vec2(+0.5, +1.5) * uPatchSize);
+    float l22 = lod_factor(p0 + vec2(+1.5, +1.5) * uPatchSize);
+
+    vec4 lods = vec4(
+        dot(vec4(l01, l11, l02, l12), vec4(0.25)),
+        dot(vec4(l00, l10, l01, l11), vec4(0.25)),
+        dot(vec4(l10, l20, l11, l21), vec4(0.25)),
+        dot(vec4(l11, l21, l12, l22), vec4(0.25)));
+
+    vPatchLods = lods;
+
+    vec4 outer_lods = min(lods.xyzw, lods.yzwx);
+    vec4 levels = tess_level(outer_lods);
+    gl_TessLevelOuter[0] = levels.x;
+    gl_TessLevelOuter[1] = levels.y;
+    gl_TessLevelOuter[2] = levels.z;
+    gl_TessLevelOuter[3] = levels.w;
+
+    float min_lod = min(min(lods.x, lods.y), min(lods.z, lods.w));
+    float inner = tess_level(min(min_lod, l11));
+    gl_TessLevelInner[0] = inner;
+    gl_TessLevelInner[1] = inner;
+}
+
+void main()
+{
+    vec2 p0 = vPatchPosBase[0];
+    if (!frustum_cull(p0))
+    {
+        gl_TessLevelOuter[0] = -1.0;
+        gl_TessLevelOuter[1] = -1.0;
+        gl_TessLevelOuter[2] = -1.0;
+        gl_TessLevelOuter[3] = -1.0;
+        gl_TessLevelInner[0] = -1.0;
+        gl_TessLevelInner[1] = -1.0;
+    }
+    else
+    {
+        compute_tess_levels(p0);
+    }
+}
+
diff --git a/shaders-msl/vert/basic.for-tess.vert b/shaders-msl/vert/basic.for-tess.vert
new file mode 100644
index 00000000..8191dc2d
--- /dev/null
+++ b/shaders-msl/vert/basic.for-tess.vert
@@ -0,0 +1,17 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+    uniform mat4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+layout(location = 1) in vec3 aNormal;
+
+layout(location = 0) out vec3 vNormal;
+
+void main()
+{
+    gl_Position = uMVP * aVertex;
+    vNormal = aNormal;
+}
diff --git a/shaders-msl/vert/leaf-function.for-tess.vert b/shaders-msl/vert/leaf-function.for-tess.vert
new file mode 100644
index 00000000..cdb60fae
--- /dev/null
+++ b/shaders-msl/vert/leaf-function.for-tess.vert
@@ -0,0 +1,22 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+    uniform mat4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+layout(location = 1) in vec3 aNormal;
+
+layout(location = 0) out vec3 vNormal;
+
+void set_output()
+{
+    gl_Position = uMVP * aVertex;
+    vNormal = aNormal;
+}
+
+void main()
+{
+    set_output();
+}
diff --git a/shaders-msl/vert/no_stage_out.for-tess.vert b/shaders-msl/vert/no_stage_out.for-tess.vert
new file mode 100644
index 00000000..3c2573a6
--- /dev/null
+++ b/shaders-msl/vert/no_stage_out.for-tess.vert
@@ -0,0 +1,14 @@
+#version 450
+
+layout(binding = 0, std430) writeonly buffer _10_12
+{
+	uvec4 _m0[1024];
+} _12;
+
+layout(location = 0) in uvec4 _19;
+
+void main()
+{
+	_12._m0[gl_VertexIndex] = _19;
+}
+
diff --git a/spirv_common.hpp b/spirv_common.hpp
index c5e0245f..6ff6fa9a 100644
--- a/spirv_common.hpp
+++ b/spirv_common.hpp
@@ -1589,8 +1589,10 @@ enum ExtendedDecorations
 	// Marks a buffer block for using explicit offsets (GLSL/HLSL).
 	SPIRVCrossDecorationExplicitOffset,
 
-	// Apply to a variable in the Input storage class; marks it as holding the base group passed to vkCmdDispatchBase().
-	// In MSL, this is used to adjust the WorkgroupId and GlobalInvocationId variables.
+	// Apply to a variable in the Input storage class; marks it as holding the base group passed to vkCmdDispatchBase(),
+	// or the base vertex and instance indices passed to vkCmdDrawIndexed().
+	// In MSL, this is used to adjust the WorkgroupId and GlobalInvocationId variables in compute shaders,
+	// and to hold the BaseVertex and BaseInstance variables in vertex shaders.
 	SPIRVCrossDecorationBuiltInDispatchBase,
 
 	// Apply to a variable that is a function parameter; marks it as being a "dynamic"
@@ -1599,6 +1601,20 @@ enum ExtendedDecorations
 	// Y'CbCr conversion.
 	SPIRVCrossDecorationDynamicImageSampler,
 
+	// Apply to a variable in the Input storage class; marks it as holding the size of the stage
+	// input grid.
+	// In MSL, this is used to hold the vertex and instance counts in a tessellation pipeline
+	// vertex shader.
+	SPIRVCrossDecorationBuiltInStageInputSize,
+
+	// Apply to any access chain of a tessellation I/O variable; stores the type of the sub-object
+	// that was chained to, as recorded in the input variable itself. This is used in case the pointer
+	// is itself used as the base of an access chain, to calculate the original type of the sub-object
+	// chained to, in case a swizzle needs to be applied. This should not happen normally with valid
+	// SPIR-V, but the MSL backend can change the type of input variables, necessitating the
+	// addition of swizzles to keep the generated code compiling.
+	SPIRVCrossDecorationTessIOOriginalInputTypeID,
+
 	SPIRVCrossDecorationCount
 };
 
diff --git a/spirv_cross_c.cpp b/spirv_cross_c.cpp
index ec1edd99..c7d1361b 100644
--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@@ -635,6 +635,26 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_MSL_ENABLE_CLIP_DISTANCE_USER_VARYING:
 		options->msl.enable_clip_distance_user_varying = value != 0;
 		break;
+
+	case SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP:
+		options->msl.multi_patch_workgroup = value != 0;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX:
+		options->msl.shader_input_buffer_index = value;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX:
+		options->msl.shader_index_buffer_index = value;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION:
+		options->msl.vertex_for_tessellation = value != 0;
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE:
+		options->msl.vertex_index_type = static_cast<CompilerMSL::Options::IndexType>(value);
+		break;
 #endif
 
 	default:
diff --git a/spirv_cross_c.h b/spirv_cross_c.h
index 082c83b8..0ea8bf69 100644
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@@ -33,7 +33,7 @@ extern "C" {
 /* Bumped if ABI or API breaks backwards compatibility. */
 #define SPVC_C_API_VERSION_MAJOR 0
 /* Bumped if APIs or enumerations are added in a backwards compatible way. */
-#define SPVC_C_API_VERSION_MINOR 35
+#define SPVC_C_API_VERSION_MINOR 36
 /* Bumped if internal implementation details change. */
 #define SPVC_C_API_VERSION_PATCH 0
 
@@ -258,12 +258,23 @@ typedef enum spvc_msl_platform
 	SPVC_MSL_PLATFORM_MAX_INT = 0x7fffffff
 } spvc_msl_platform;
 
+/* Maps to C++ API. */
+typedef enum spvc_msl_index_type
+{
+	SPVC_MSL_INDEX_TYPE_NONE = 0,
+	SPVC_MSL_INDEX_TYPE_UINT16 = 1,
+	SPVC_MSL_INDEX_TYPE_UINT32 = 2,
+	SPVC_MSL_INDEX_TYPE_MAX_INT = 0x7fffffff
+} spvc_msl_index_type;
+
 /* Maps to C++ API. */
 typedef enum spvc_msl_shader_input_format
 {
 	SPVC_MSL_SHADER_INPUT_FORMAT_OTHER = 0,
 	SPVC_MSL_SHADER_INPUT_FORMAT_UINT8 = 1,
 	SPVC_MSL_SHADER_INPUT_FORMAT_UINT16 = 2,
+	SPVC_MSL_SHADER_INPUT_FORMAT_ANY16 = 3,
+	SPVC_MSL_SHADER_INPUT_FORMAT_ANY32 = 4,
 
 	/* Deprecated names. */
 	SPVC_MSL_VERTEX_FORMAT_OTHER = SPVC_MSL_SHADER_INPUT_FORMAT_OTHER,
@@ -617,6 +628,12 @@ typedef enum spvc_compiler_option
 
 	SPVC_COMPILER_OPTION_HLSL_ENABLE_16BIT_TYPES = 60 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
+	SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP = 61 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX = 62 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX = 63 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION = 64 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE = 65 | SPVC_COMPILER_OPTION_MSL_BIT,
+
 	SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 } spvc_compiler_option;
 
diff --git a/spirv_msl.cpp b/spirv_msl.cpp
index 2da92288..21b91874 100644
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -136,7 +136,8 @@ bool CompilerMSL::builtin_translates_to_nonarray(spv::BuiltIn builtin) const
 void CompilerMSL::build_implicit_builtins()
 {
 	bool need_sample_pos = active_input_builtins.get(BuiltInSamplePosition);
-	bool need_vertex_params = capture_output_to_buffer && get_execution_model() == ExecutionModelVertex;
+	bool need_vertex_params = capture_output_to_buffer && get_execution_model() == ExecutionModelVertex &&
+	                          !msl_options.vertex_for_tessellation;
 	bool need_tesc_params = get_execution_model() == ExecutionModelTessellationControl;
 	bool need_subgroup_mask =
 	    active_input_builtins.get(BuiltInSubgroupEqMask) || active_input_builtins.get(BuiltInSubgroupGeMask) ||
@@ -149,8 +150,15 @@ void CompilerMSL::build_implicit_builtins()
 	bool need_dispatch_base =
 	    msl_options.dispatch_base && get_execution_model() == ExecutionModelGLCompute &&
 	    (active_input_builtins.get(BuiltInWorkgroupId) || active_input_builtins.get(BuiltInGlobalInvocationId));
+	bool need_grid_params = get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation;
+	bool need_vertex_base_params =
+	    need_grid_params &&
+	    (active_input_builtins.get(BuiltInVertexId) || active_input_builtins.get(BuiltInVertexIndex) ||
+	     active_input_builtins.get(BuiltInBaseVertex) || active_input_builtins.get(BuiltInInstanceId) ||
+	     active_input_builtins.get(BuiltInInstanceIndex) || active_input_builtins.get(BuiltInBaseInstance));
 	if (need_subpass_input || need_sample_pos || need_subgroup_mask || need_vertex_params || need_tesc_params ||
-	    need_multiview || need_dispatch_base || needs_subgroup_invocation_id)
+	    need_multiview || need_dispatch_base || need_vertex_base_params || need_grid_params ||
+	    needs_subgroup_invocation_id)
 	{
 		bool has_frag_coord = false;
 		bool has_sample_id = false;
@@ -417,7 +425,8 @@ void CompilerMSL::build_implicit_builtins()
 			}
 		}
 
-		if (need_tesc_params && (!has_invocation_id || !has_primitive_id))
+		if ((need_tesc_params && (msl_options.multi_patch_workgroup || !has_invocation_id || !has_primitive_id)) ||
+		    need_grid_params)
 		{
 			uint32_t type_ptr_id = ir.increase_bound_by(1);
 
@@ -429,7 +438,17 @@ void CompilerMSL::build_implicit_builtins()
 			auto &ptr_type = set<SPIRType>(type_ptr_id, uint_type_ptr);
 			ptr_type.self = get_uint_type_id();
 
-			if (!has_invocation_id)
+			if (msl_options.multi_patch_workgroup || need_grid_params)
+			{
+				uint32_t var_id = ir.increase_bound_by(1);
+
+				// Create gl_GlobalInvocationID.
+				set<SPIRVariable>(var_id, type_ptr_id, StorageClassInput);
+				set_decoration(var_id, DecorationBuiltIn, BuiltInGlobalInvocationId);
+				builtin_invocation_id_id = var_id;
+				mark_implicit_builtin(StorageClassInput, BuiltInGlobalInvocationId, var_id);
+			}
+			else if (need_tesc_params && !has_invocation_id)
 			{
 				uint32_t var_id = ir.increase_bound_by(1);
 
@@ -440,7 +459,7 @@ void CompilerMSL::build_implicit_builtins()
 				mark_implicit_builtin(StorageClassInput, BuiltInInvocationId, var_id);
 			}
 
-			if (!has_primitive_id)
+			if (need_tesc_params && !has_primitive_id)
 			{
 				uint32_t var_id = ir.increase_bound_by(1);
 
@@ -450,6 +469,17 @@ void CompilerMSL::build_implicit_builtins()
 				builtin_primitive_id_id = var_id;
 				mark_implicit_builtin(StorageClassInput, BuiltInPrimitiveId, var_id);
 			}
+
+			if (need_grid_params)
+			{
+				uint32_t var_id = ir.increase_bound_by(1);
+
+				set<SPIRVariable>(var_id, build_extended_vector_type(get_uint_type_id(), 3), StorageClassInput);
+				set_extended_decoration(var_id, SPIRVCrossDecorationBuiltInStageInputSize);
+				get_entry_point().interface_variables.push_back(var_id);
+				set_name(var_id, "spvStageInputSize");
+				builtin_stage_input_size_id = var_id;
+			}
 		}
 
 		if (!has_subgroup_invocation_id && (need_subgroup_mask || needs_subgroup_invocation_id))
@@ -494,8 +524,10 @@ void CompilerMSL::build_implicit_builtins()
 			mark_implicit_builtin(StorageClassInput, BuiltInSubgroupSize, var_id);
 		}
 
-		if (need_dispatch_base)
+		if (need_dispatch_base || need_vertex_base_params)
 		{
+			if (workgroup_id_type == 0)
+				workgroup_id_type = build_extended_vector_type(get_uint_type_id(), 3);
 			uint32_t var_id;
 			if (msl_options.supports_msl_version(1, 2))
 			{
@@ -1125,7 +1157,8 @@ void CompilerMSL::preprocess_op_codes()
 
 	// Tessellation control shaders are run as compute functions in Metal, and so
 	// must capture their output to a buffer.
-	if (get_execution_model() == ExecutionModelTessellationControl)
+	if (get_execution_model() == ExecutionModelTessellationControl ||
+	    (get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation))
 	{
 		is_rasterization_disabled = true;
 		capture_output_to_buffer = true;
@@ -1259,11 +1292,11 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 				uint32_t base_id = ops[0];
 				if (global_var_ids.find(base_id) != global_var_ids.end())
 					added_arg_ids.insert(base_id);
-				
-                		uint32_t rvalue_id = ops[1];
-                		if (global_var_ids.find(rvalue_id) != global_var_ids.end())
-                    			added_arg_ids.insert(rvalue_id);
-				
+
+				uint32_t rvalue_id = ops[1];
+				if (global_var_ids.find(rvalue_id) != global_var_ids.end())
+					added_arg_ids.insert(rvalue_id);
+
 				break;
 			}
 
@@ -1332,7 +1365,7 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 				{
 					if (added_in)
 						continue;
-					name = input_wg_var_name;
+					name = "gl_in";
 					arg_id = stage_in_ptr_var_id;
 					added_in = true;
 				}
@@ -1447,9 +1480,32 @@ void CompilerMSL::mark_as_packable(SPIRType &type)
 }
 
 // If a shader input exists at the location, it is marked as being used by this shader
-void CompilerMSL::mark_location_as_used_by_shader(uint32_t location, StorageClass storage)
+void CompilerMSL::mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, StorageClass storage)
 {
-	if (storage == StorageClassInput)
+	if (storage != StorageClassInput)
+		return;
+	if (is_array(type))
+	{
+		uint32_t dim = 1;
+		for (uint32_t i = 0; i < type.array.size(); i++)
+			dim *= to_array_size_literal(type, i);
+		for (uint32_t i = 0; i < dim; i++)
+		{
+			if (is_matrix(type))
+			{
+				for (uint32_t j = 0; j < type.columns; j++)
+					inputs_in_use.insert(location++);
+			}
+			else
+				inputs_in_use.insert(location++);
+		}
+	}
+	else if (is_matrix(type))
+	{
+		for (uint32_t i = 0; i < type.columns; i++)
+			inputs_in_use.insert(location + i);
+	}
+	else
 		inputs_in_use.insert(location);
 }
 
@@ -1465,13 +1521,37 @@ uint32_t CompilerMSL::get_target_components_for_fragment_location(uint32_t locat
 uint32_t CompilerMSL::build_extended_vector_type(uint32_t type_id, uint32_t components, SPIRType::BaseType basetype)
 {
 	uint32_t new_type_id = ir.increase_bound_by(1);
-	auto &type = set<SPIRType>(new_type_id, get<SPIRType>(type_id));
-	type.vecsize = components;
+	auto &old_type = get<SPIRType>(type_id);
+	auto *type = &set<SPIRType>(new_type_id, old_type);
+	type->vecsize = components;
 	if (basetype != SPIRType::Unknown)
-		type.basetype = basetype;
-	type.self = new_type_id;
-	type.parent_type = type_id;
-	type.pointer = false;
+		type->basetype = basetype;
+	type->self = new_type_id;
+	type->parent_type = type_id;
+	type->array.clear();
+	type->array_size_literal.clear();
+	type->pointer = false;
+
+	if (is_array(old_type))
+	{
+		uint32_t array_type_id = ir.increase_bound_by(1);
+		type = &set<SPIRType>(array_type_id, *type);
+		type->parent_type = new_type_id;
+		type->array = old_type.array;
+		type->array_size_literal = old_type.array_size_literal;
+		new_type_id = array_type_id;
+	}
+
+	if (old_type.pointer)
+	{
+		uint32_t ptr_type_id = ir.increase_bound_by(1);
+		type = &set<SPIRType>(ptr_type_id, *type);
+		type->self = new_type_id;
+		type->parent_type = new_type_id;
+		type->storage = old_type.storage;
+		type->pointer = true;
+		new_type_id = ptr_type_id;
+	}
 
 	return new_type_id;
 }
@@ -1636,13 +1716,13 @@ void CompilerMSL::add_plain_variable_to_interface_block(StorageClass storage, co
 			ib_type.member_types[ib_mbr_idx] = type_id;
 		}
 		set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-		mark_location_as_used_by_shader(locn, storage);
+		mark_location_as_used_by_shader(locn, get<SPIRType>(type_id), storage);
 	}
 	else if (is_builtin && is_tessellation_shader() && inputs_by_builtin.count(builtin))
 	{
 		uint32_t locn = inputs_by_builtin[builtin].location;
 		set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-		mark_location_as_used_by_shader(locn, storage);
+		mark_location_as_used_by_shader(locn, type, storage);
 	}
 
 	if (!location_meta)
@@ -1792,13 +1872,13 @@ void CompilerMSL::add_composite_variable_to_interface_block(StorageClass storage
 				ib_type.member_types[ib_mbr_idx] = mbr_type_id;
 			}
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, storage);
+			mark_location_as_used_by_shader(locn, *usable_type, storage);
 		}
 		else if (is_builtin && is_tessellation_shader() && inputs_by_builtin.count(builtin))
 		{
 			uint32_t locn = inputs_by_builtin[builtin].location + i;
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, storage);
+			mark_location_as_used_by_shader(locn, *usable_type, storage);
 		}
 		else if (is_builtin && builtin == BuiltInClipDistance)
 		{
@@ -1966,19 +2046,19 @@ void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass
 		{
 			uint32_t locn = get_member_decoration(var_type.self, mbr_idx, DecorationLocation) + i;
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, storage);
+			mark_location_as_used_by_shader(locn, *usable_type, storage);
 		}
 		else if (has_decoration(var.self, DecorationLocation))
 		{
 			uint32_t locn = get_accumulated_member_location(var, mbr_idx, meta.strip_array) + i;
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, storage);
+			mark_location_as_used_by_shader(locn, *usable_type, storage);
 		}
 		else if (is_builtin && is_tessellation_shader() && inputs_by_builtin.count(builtin))
 		{
 			uint32_t locn = inputs_by_builtin[builtin].location + i;
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, storage);
+			mark_location_as_used_by_shader(locn, *usable_type, storage);
 		}
 		else if (is_builtin && builtin == BuiltInClipDistance)
 		{
@@ -2108,7 +2188,7 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor
 			ib_type.member_types[ib_mbr_idx] = mbr_type_id;
 		}
 		set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-		mark_location_as_used_by_shader(locn, storage);
+		mark_location_as_used_by_shader(locn, get<SPIRType>(mbr_type_id), storage);
 	}
 	else if (has_decoration(var.self, DecorationLocation))
 	{
@@ -2122,7 +2202,7 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor
 			ib_type.member_types[ib_mbr_idx] = mbr_type_id;
 		}
 		set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-		mark_location_as_used_by_shader(locn, storage);
+		mark_location_as_used_by_shader(locn, get<SPIRType>(mbr_type_id), storage);
 	}
 	else if (is_builtin && is_tessellation_shader() && inputs_by_builtin.count(builtin))
 	{
@@ -2131,7 +2211,7 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor
 		if (builtin_itr != end(inputs_by_builtin))
 			locn = builtin_itr->second.location;
 		set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-		mark_location_as_used_by_shader(locn, storage);
+		mark_location_as_used_by_shader(locn, get<SPIRType>(mbr_type_id), storage);
 	}
 
 	// Copy the component location, if present.
@@ -2207,13 +2287,13 @@ void CompilerMSL::add_tess_level_input_to_interface_block(const std::string &ib_
 			{
 				uint32_t locn = get_decoration(var.self, DecorationLocation);
 				set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-				mark_location_as_used_by_shader(locn, StorageClassInput);
+				mark_location_as_used_by_shader(locn, var_type, StorageClassInput);
 			}
 			else if (inputs_by_builtin.count(builtin))
 			{
 				uint32_t locn = inputs_by_builtin[builtin].location;
 				set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-				mark_location_as_used_by_shader(locn, StorageClassInput);
+				mark_location_as_used_by_shader(locn, var_type, StorageClassInput);
 			}
 
 			added_builtin_tess_level = true;
@@ -2268,13 +2348,13 @@ void CompilerMSL::add_tess_level_input_to_interface_block(const std::string &ib_
 		{
 			uint32_t locn = get_decoration(var.self, DecorationLocation);
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, StorageClassInput);
+			mark_location_as_used_by_shader(locn, new_var_type, StorageClassInput);
 		}
 		else if (inputs_by_builtin.count(builtin))
 		{
 			uint32_t locn = inputs_by_builtin[builtin].location;
 			set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, locn);
-			mark_location_as_used_by_shader(locn, StorageClassInput);
+			mark_location_as_used_by_shader(locn, new_var_type, StorageClassInput);
 		}
 	}
 }
@@ -2328,7 +2408,10 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st
 					bool is_composite_type = is_matrix(mbr_type) || is_array(mbr_type);
 					bool attribute_load_store =
 					    storage == StorageClassInput && get_execution_model() != ExecutionModelFragment;
-					bool storage_is_stage_io = storage == StorageClassInput || storage == StorageClassOutput;
+					bool storage_is_stage_io =
+					    (storage == StorageClassInput && !(get_execution_model() == ExecutionModelTessellationControl &&
+					                                       msl_options.multi_patch_workgroup)) ||
+					    storage == StorageClassOutput;
 
 					// ClipDistance always needs to be declared as user attributes.
 					if (builtin == BuiltInClipDistance)
@@ -2359,7 +2442,9 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st
 		{
 			bool is_composite_type = is_matrix(var_type) || is_array(var_type);
 			bool storage_is_stage_io =
-			    storage == StorageClassInput || (storage == StorageClassOutput && !capture_output_to_buffer);
+			    (storage == StorageClassInput &&
+			     !(get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup)) ||
+			    (storage == StorageClassOutput && !capture_output_to_buffer);
 			bool attribute_load_store = storage == StorageClassInput && get_execution_model() != ExecutionModelFragment;
 
 			// ClipDistance always needs to be declared as user attributes.
@@ -2551,17 +2636,30 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch)
 		ib_var_ref = patch ? patch_stage_in_var_name : stage_in_var_name;
 		if (get_execution_model() == ExecutionModelTessellationControl)
 		{
-			// Add a hook to populate the shared workgroup memory containing
-			// the gl_in array.
+			// Add a hook to populate the shared workgroup memory containing the gl_in array.
 			entry_func.fixup_hooks_in.push_back([=]() {
-				// Can't use PatchVertices yet; the hook for that may not have run yet.
-				statement("if (", to_expression(builtin_invocation_id_id), " < ", "spvIndirectParams[0])");
-				statement("    ", input_wg_var_name, "[", to_expression(builtin_invocation_id_id), "] = ", ib_var_ref,
-				          ";");
-				statement("threadgroup_barrier(mem_flags::mem_threadgroup);");
-				statement("if (", to_expression(builtin_invocation_id_id), " >= ", get_entry_point().output_vertices,
-				          ")");
-				statement("    return;");
+				// Can't use PatchVertices, PrimitiveId, or InvocationId yet; the hooks for those may not have run yet.
+				if (msl_options.multi_patch_workgroup)
+				{
+					// n.b. builtin_invocation_id_id here is the dispatch global invocation ID,
+					// not the TC invocation ID.
+					statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "* gl_in = &",
+					          input_buffer_var_name, "[min(", to_expression(builtin_invocation_id_id), ".x / ",
+					          get_entry_point().output_vertices,
+							  ", spvIndirectParams[1] - 1) * spvIndirectParams[0]];");
+				}
+				else
+				{
+					// It's safe to use InvocationId here because it's directly mapped to a
+					// Metal builtin, and therefore doesn't need a hook.
+					statement("if (", to_expression(builtin_invocation_id_id), " < spvIndirectParams[0])");
+					statement("    ", input_wg_var_name, "[", to_expression(builtin_invocation_id_id),
+					          "] = ", ib_var_ref, ";");
+					statement("threadgroup_barrier(mem_flags::mem_threadgroup);");
+					statement("if (", to_expression(builtin_invocation_id_id),
+					          " >= ", get_entry_point().output_vertices, ")");
+					statement("    return;");
+				}
 			});
 		}
 		break;
@@ -2603,7 +2701,14 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch)
 						// The first member of the indirect buffer is always the number of vertices
 						// to draw.
 						// We zero-base the InstanceID & VertexID variables for HLSL emulation elsewhere, so don't do it twice
-						if (msl_options.enable_base_index_zero)
+						if (get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation)
+						{
+							statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "& ", ib_var_ref,
+							          " = ", output_buffer_var_name, "[", to_expression(builtin_invocation_id_id),
+							          ".y * ", to_expression(builtin_stage_input_size_id), ".x + ",
+							          to_expression(builtin_invocation_id_id), ".x];");
+						}
+						else if (msl_options.enable_base_index_zero)
 						{
 							statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "& ", ib_var_ref,
 							          " = ", output_buffer_var_name, "[", to_expression(builtin_instance_idx_id),
@@ -2621,17 +2726,46 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch)
 				});
 				break;
 			case ExecutionModelTessellationControl:
-				if (patch)
-					entry_func.fixup_hooks_in.push_back([=]() {
-						statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "& ", ib_var_ref, " = ",
-						          patch_output_buffer_var_name, "[", to_expression(builtin_primitive_id_id), "];");
-					});
+				if (msl_options.multi_patch_workgroup)
+				{
+					// We cannot use PrimitiveId here, because the hook may not have run yet.
+					if (patch)
+					{
+						entry_func.fixup_hooks_in.push_back([=]() {
+							statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "& ", ib_var_ref,
+							          " = ", patch_output_buffer_var_name, "[", to_expression(builtin_invocation_id_id),
+							          ".x / ", get_entry_point().output_vertices, "];");
+						});
+					}
+					else
+					{
+						entry_func.fixup_hooks_in.push_back([=]() {
+							statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "* gl_out = &",
+							          output_buffer_var_name, "[", to_expression(builtin_invocation_id_id), ".x - ",
+							          to_expression(builtin_invocation_id_id), ".x % ",
+							          get_entry_point().output_vertices, "];");
+						});
+					}
+				}
 				else
-					entry_func.fixup_hooks_in.push_back([=]() {
-						statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "* gl_out = &",
-						          output_buffer_var_name, "[", to_expression(builtin_primitive_id_id), " * ",
-						          get_entry_point().output_vertices, "];");
-					});
+				{
+					if (patch)
+					{
+						entry_func.fixup_hooks_in.push_back([=]() {
+							statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "& ", ib_var_ref,
+							          " = ", patch_output_buffer_var_name, "[", to_expression(builtin_primitive_id_id),
+							          "];");
+						});
+					}
+					else
+					{
+						entry_func.fixup_hooks_in.push_back([=]() {
+							statement("device ", to_name(ir.default_entry_point), "_", ib_var_ref, "* gl_out = &",
+							          output_buffer_var_name, "[", to_expression(builtin_primitive_id_id), " * ",
+							          get_entry_point().output_vertices, "];");
+						});
+					}
+				}
 				break;
 			default:
 				break;
@@ -2658,6 +2792,58 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch)
 		add_variable_to_interface_block(storage, ib_var_ref, ib_type, *p_var, meta);
 	}
 
+	if (get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup &&
+	    storage == StorageClassInput)
+	{
+		// For tessellation control inputs, add all outputs from the vertex shader to ensure
+		// the struct containing them is the correct size and layout.
+		for (auto &input : inputs_by_location)
+		{
+			if (is_msl_shader_input_used(input.first))
+				continue;
+
+			// Create a fake variable to put at the location.
+			uint32_t offset = ir.increase_bound_by(4);
+			uint32_t type_id = offset;
+			uint32_t array_type_id = offset + 1;
+			uint32_t ptr_type_id = offset + 2;
+			uint32_t var_id = offset + 3;
+
+			SPIRType type;
+			switch (input.second.format)
+			{
+			case MSL_SHADER_INPUT_FORMAT_UINT16:
+			case MSL_SHADER_INPUT_FORMAT_ANY16:
+				type.basetype = SPIRType::UShort;
+				type.width = 16;
+				break;
+			case MSL_SHADER_INPUT_FORMAT_ANY32:
+			default:
+				type.basetype = SPIRType::UInt;
+				type.width = 32;
+				break;
+			}
+			type.vecsize = input.second.vecsize;
+			set<SPIRType>(type_id, type);
+
+			type.array.push_back(0);
+			type.array_size_literal.push_back(true);
+			type.parent_type = type_id;
+			set<SPIRType>(array_type_id, type);
+
+			type.pointer = true;
+			type.parent_type = array_type_id;
+			type.storage = storage;
+			auto &ptr_type = set<SPIRType>(ptr_type_id, type);
+			ptr_type.self = array_type_id;
+
+			auto &fake_var = set<SPIRVariable>(var_id, ptr_type_id, storage);
+			set_decoration(var_id, DecorationLocation, input.first);
+			meta.strip_array = true;
+			add_variable_to_interface_block(storage, ib_var_ref, ib_type, fake_var, meta);
+		}
+	}
+
 	// Sort the members of the structure by their locations.
 	MemberSorter member_sorter(ib_type, ir.meta[ib_type_id], MemberSorter::Location);
 	member_sorter.sort();
@@ -2701,7 +2887,10 @@ uint32_t CompilerMSL::add_interface_block_pointer(uint32_t ib_var_id, StorageCla
 		auto &ib_ptr_type = set<SPIRType>(ib_ptr_type_id, ib_type);
 		ib_ptr_type.parent_type = ib_ptr_type.type_alias = ib_type.self;
 		ib_ptr_type.pointer = true;
-		ib_ptr_type.storage = storage == StorageClassInput ? StorageClassWorkgroup : StorageClassStorageBuffer;
+		ib_ptr_type.storage =
+		    storage == StorageClassInput ?
+		        (msl_options.multi_patch_workgroup ? StorageClassStorageBuffer : StorageClassWorkgroup) :
+		        StorageClassStorageBuffer;
 		ir.meta[ib_ptr_type_id] = ir.meta[ib_type.self];
 		// To ensure that get_variable_data_type() doesn't strip off the pointer,
 		// which we need, use another pointer.
@@ -2714,7 +2903,7 @@ uint32_t CompilerMSL::add_interface_block_pointer(uint32_t ib_var_id, StorageCla
 
 		ib_ptr_var_id = next_id;
 		set<SPIRVariable>(ib_ptr_var_id, ib_ptr_ptr_type_id, StorageClassFunction, 0);
-		set_name(ib_ptr_var_id, storage == StorageClassInput ? input_wg_var_name : "gl_out");
+		set_name(ib_ptr_var_id, storage == StorageClassInput ? "gl_in" : "gl_out");
 	}
 	else
 	{
@@ -2816,7 +3005,7 @@ uint32_t CompilerMSL::ensure_correct_input_type(uint32_t type_id, uint32_t locat
 		}
 	}
 
-	case MSL_VERTEX_FORMAT_UINT16:
+	case MSL_SHADER_INPUT_FORMAT_UINT16:
 	{
 		switch (type.basetype)
 		{
@@ -5289,7 +5478,7 @@ void CompilerMSL::emit_specialization_constants_and_structs()
 			auto &type = id.get<SPIRType>();
 			TypeID type_id = type.self;
 
-			bool is_struct = (type.basetype == SPIRType::Struct) && type.array.empty();
+			bool is_struct = (type.basetype == SPIRType::Struct) && type.array.empty() && !type.pointer;
 			bool is_block =
 			    has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock);
 
@@ -5351,7 +5540,10 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 	if (ptr_type.storage == StorageClassOutput && get_execution_model() == ExecutionModelTessellationEvaluation)
 		return false;
 
-	bool flat_data_type = is_matrix(result_type) || is_array(result_type) || result_type.basetype == SPIRType::Struct;
+	bool multi_patch_tess_ctl = get_execution_model() == ExecutionModelTessellationControl &&
+	                            msl_options.multi_patch_workgroup && ptr_type.storage == StorageClassInput;
+	bool flat_matrix = is_matrix(result_type) && !multi_patch_tess_ctl;
+	bool flat_data_type = flat_matrix || is_array(result_type) || result_type.basetype == SPIRType::Struct;
 	if (!flat_data_type)
 		return false;
 
@@ -5366,6 +5558,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 	uint32_t interface_index = get_extended_decoration(ptr, SPIRVCrossDecorationInterfaceMemberIndex);
 	auto *var = maybe_get_backing_variable(ptr);
 	bool ptr_is_io_variable = ir.ids[ptr].get_type() == TypeVariable;
+	auto &expr_type = get_pointee_type(ptr_type.self);
 
 	const auto &iface_type = expression_type(stage_in_ptr_var_id);
 
@@ -5379,7 +5572,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 			SPIRV_CROSS_THROW("Loading an array-of-array must be loaded directly from an IO variable.");
 		if (interface_index == uint32_t(-1))
 			SPIRV_CROSS_THROW("Interface index is unknown. Cannot continue.");
-		if (result_type.basetype == SPIRType::Struct || is_matrix(result_type))
+		if (result_type.basetype == SPIRType::Struct || flat_matrix)
 			SPIRV_CROSS_THROW("Cannot load array-of-array of composite type in tessellation IO.");
 
 		expr += type_to_glsl(result_type) + "({ ";
@@ -5393,16 +5586,44 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 			expr += type_to_glsl(sub_type) + "({ ";
 			interface_index = base_interface_index;
 			uint32_t array_size = to_array_size_literal(result_type, 0);
-			for (uint32_t j = 0; j < array_size; j++, interface_index++)
+			if (multi_patch_tess_ctl)
 			{
-				const uint32_t indices[2] = { i, interface_index };
+				for (uint32_t j = 0; j < array_size; j++)
+				{
+					const uint32_t indices[3] = { i, interface_index, j };
 
-				AccessChainMeta meta;
-				expr += access_chain_internal(stage_in_ptr_var_id, indices, 2,
-				                              ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+					AccessChainMeta meta;
+					expr +=
+					    access_chain_internal(stage_in_ptr_var_id, indices, 3,
+					                          ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+					// If the expression has more vector components than the result type, insert
+					// a swizzle. This shouldn't happen normally on valid SPIR-V, but it might
+					// happen if we replace the type of an input variable.
+					if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct &&
+					    expr_type.vecsize > sub_type.vecsize)
+						expr += vector_swizzle(sub_type.vecsize, 0);
 
-				if (j + 1 < array_size)
-					expr += ", ";
+					if (j + 1 < array_size)
+						expr += ", ";
+				}
+			}
+			else
+			{
+				for (uint32_t j = 0; j < array_size; j++, interface_index++)
+				{
+					const uint32_t indices[2] = { i, interface_index };
+
+					AccessChainMeta meta;
+					expr +=
+					    access_chain_internal(stage_in_ptr_var_id, indices, 2,
+					                          ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+					if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct &&
+					    expr_type.vecsize > sub_type.vecsize)
+						expr += vector_swizzle(sub_type.vecsize, 0);
+
+					if (j + 1 < array_size)
+						expr += ", ";
+				}
 			}
 			expr += " })";
 			if (i + 1 < num_control_points)
@@ -5442,7 +5663,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 					SPIRV_CROSS_THROW("Interface index is unknown. Cannot continue.");
 
 				const auto &mbr_type = get<SPIRType>(struct_type.member_types[j]);
-				if (is_matrix(mbr_type))
+				const auto &expr_mbr_type = get<SPIRType>(expr_type.member_types[j]);
+				if (is_matrix(mbr_type) && !multi_patch_tess_ctl)
 				{
 					expr += type_to_glsl(mbr_type) + "(";
 					for (uint32_t k = 0; k < mbr_type.columns; k++, interface_index++)
@@ -5457,6 +5679,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 						}
 						else
 							expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+						if (expr_mbr_type.vecsize > mbr_type.vecsize)
+							expr += vector_swizzle(mbr_type.vecsize, 0);
 
 						if (k + 1 < mbr_type.columns)
 							expr += ", ";
@@ -5467,21 +5691,48 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 				{
 					expr += type_to_glsl(mbr_type) + "({ ";
 					uint32_t array_size = to_array_size_literal(mbr_type, 0);
-					for (uint32_t k = 0; k < array_size; k++, interface_index++)
+					if (multi_patch_tess_ctl)
 					{
-						if (is_array_of_struct)
+						for (uint32_t k = 0; k < array_size; k++)
 						{
-							const uint32_t indices[2] = { i, interface_index };
-							AccessChainMeta meta;
-							expr += access_chain_internal(
-							    stage_in_ptr_var_id, indices, 2,
-							    ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
-						}
-						else
-							expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+							if (is_array_of_struct)
+							{
+								const uint32_t indices[3] = { i, interface_index, k };
+								AccessChainMeta meta;
+								expr += access_chain_internal(
+								    stage_in_ptr_var_id, indices, 3,
+								    ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+							}
+							else
+								expr += join(to_expression(ptr), ".", to_member_name(iface_type, interface_index), "[",
+								             k, "]");
+							if (expr_mbr_type.vecsize > mbr_type.vecsize)
+								expr += vector_swizzle(mbr_type.vecsize, 0);
 
-						if (k + 1 < array_size)
-							expr += ", ";
+							if (k + 1 < array_size)
+								expr += ", ";
+						}
+					}
+					else
+					{
+						for (uint32_t k = 0; k < array_size; k++, interface_index++)
+						{
+							if (is_array_of_struct)
+							{
+								const uint32_t indices[2] = { i, interface_index };
+								AccessChainMeta meta;
+								expr += access_chain_internal(
+								    stage_in_ptr_var_id, indices, 2,
+								    ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+							}
+							else
+								expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+							if (expr_mbr_type.vecsize > mbr_type.vecsize)
+								expr += vector_swizzle(mbr_type.vecsize, 0);
+
+							if (k + 1 < array_size)
+								expr += ", ";
+						}
 					}
 					expr += " })";
 				}
@@ -5497,6 +5748,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 					}
 					else
 						expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+					if (expr_mbr_type.vecsize > mbr_type.vecsize)
+						expr += vector_swizzle(mbr_type.vecsize, 0);
 				}
 
 				if (j + 1 < struct_type.member_types.size())
@@ -5509,7 +5762,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 		if (is_array_of_struct)
 			expr += " })";
 	}
-	else if (is_matrix(result_type))
+	else if (flat_matrix)
 	{
 		bool is_array_of_matrix = is_array(result_type);
 		if (is_array_of_matrix && !ptr_is_io_variable)
@@ -5538,6 +5791,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 					expr +=
 					    access_chain_internal(stage_in_ptr_var_id, indices, 2,
 					                          ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+					if (expr_type.vecsize > result_type.vecsize)
+						expr += vector_swizzle(result_type.vecsize, 0);
 					if (j + 1 < result_type.columns)
 						expr += ", ";
 				}
@@ -5554,6 +5809,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 			for (uint32_t i = 0; i < result_type.columns; i++, interface_index++)
 			{
 				expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+				if (expr_type.vecsize > result_type.vecsize)
+					expr += vector_swizzle(result_type.vecsize, 0);
 				if (i + 1 < result_type.columns)
 					expr += ", ";
 			}
@@ -5579,6 +5836,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 			AccessChainMeta meta;
 			expr += access_chain_internal(stage_in_ptr_var_id, indices, 2,
 			                              ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+			if (expr_type.vecsize > result_type.vecsize)
+				expr += vector_swizzle(result_type.vecsize, 0);
 
 			if (i + 1 < num_control_points)
 				expr += ", ";
@@ -5598,6 +5857,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 		for (uint32_t i = 0; i < array_size; i++, interface_index++)
 		{
 			expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+			if (expr_type.vecsize > result_type.vecsize)
+				expr += vector_swizzle(result_type.vecsize, 0);
 			if (i + 1 < array_size)
 				expr += ", ";
 		}
@@ -5620,6 +5881,7 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 	bool patch = false;
 	bool flat_data = false;
 	bool ptr_is_chain = false;
+	bool multi_patch = get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup;
 
 	if (var)
 	{
@@ -5680,7 +5942,7 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 			// we're hosed.
 			for (; i < length; ++i)
 			{
-				if (!is_array(*type) && !is_matrix(*type) && type->basetype != SPIRType::Struct)
+				if ((multi_patch || (!is_array(*type) && !is_matrix(*type))) && type->basetype != SPIRType::Struct)
 					break;
 
 				auto *c = maybe_get<SPIRConstant>(ops[i]);
@@ -5699,7 +5961,8 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 					type = &get<SPIRType>(type->member_types[c->scalar()]);
 			}
 
-			if (is_matrix(result_ptr_type) || is_array(result_ptr_type) || result_ptr_type.basetype == SPIRType::Struct)
+			if ((!multi_patch && (is_matrix(result_ptr_type) || is_array(result_ptr_type))) ||
+			    result_ptr_type.basetype == SPIRType::Struct)
 			{
 				// We're not going to emit the actual member name, we let any further OpLoad take care of that.
 				// Tag the access chain with the member index we're referencing.
@@ -5760,6 +6023,24 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 			}
 		}
 
+		// Get the actual type of the object that was accessed. If it's a vector type and we changed it,
+		// then we'll need to add a swizzle.
+		// For this, we can't necessarily rely on the type of the base expression, because it might be
+		// another access chain, and it will therefore already have the "correct" type.
+		auto *expr_type = &get_variable_data_type(*var);
+		if (has_extended_decoration(ops[2], SPIRVCrossDecorationTessIOOriginalInputTypeID))
+			expr_type = &get<SPIRType>(get_extended_decoration(ops[2], SPIRVCrossDecorationTessIOOriginalInputTypeID));
+		for (uint32_t i = 3; i < length; i++)
+		{
+			if (!is_array(*expr_type) && expr_type->basetype == SPIRType::Struct)
+				expr_type = &get<SPIRType>(expr_type->member_types[get<SPIRConstant>(ops[i]).scalar()]);
+			else
+				expr_type = &get<SPIRType>(expr_type->parent_type);
+		}
+		if (!is_array(*expr_type) && !is_matrix(*expr_type) && expr_type->basetype != SPIRType::Struct &&
+		    expr_type->vecsize > result_ptr_type.vecsize)
+			e += vector_swizzle(result_ptr_type.vecsize, 0);
+
 		auto &expr = set<SPIRExpression>(ops[1], move(e), ops[0], should_forward(ops[2]));
 		expr.loaded_from = var->self;
 		expr.need_transpose = meta.need_transpose;
@@ -5772,6 +6053,8 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 			set_extended_decoration(ops[1], SPIRVCrossDecorationPhysicalTypeID, meta.storage_physical_type);
 		if (meta.storage_is_invariant)
 			set_decoration(ops[1], DecorationInvariant);
+		// Save the type we found in case the result is used in another access chain.
+		set_extended_decoration(ops[1], SPIRVCrossDecorationTessIOOriginalInputTypeID, expr_type->self);
 
 		// If we have some expression dependencies in our access chain, this access chain is technically a forwarded
 		// temporary which could be subject to invalidation.
@@ -8777,6 +9060,8 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in
 			case BuiltInInstanceId:
 			case BuiltInInstanceIndex:
 			case BuiltInBaseInstance:
+				if (msl_options.vertex_for_tessellation)
+					return "";
 				return string(" [[") + builtin_qualifier(builtin) + "]]";
 
 			case BuiltInDrawIndex:
@@ -8792,7 +9077,8 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in
 	}
 
 	// Vertex and tessellation evaluation function outputs
-	if ((execution.model == ExecutionModelVertex || execution.model == ExecutionModelTessellationEvaluation) &&
+	if (((execution.model == ExecutionModelVertex && !msl_options.vertex_for_tessellation) ||
+	     execution.model == ExecutionModelTessellationEvaluation) &&
 	    type.storage == StorageClassOutput)
 	{
 		if (is_builtin)
@@ -8844,6 +9130,9 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in
 			{
 			case BuiltInInvocationId:
 			case BuiltInPrimitiveId:
+				if (msl_options.multi_patch_workgroup)
+					return "";
+				/* fallthrough */
 			case BuiltInSubgroupLocalInvocationId: // FIXME: Should work in any stage
 			case BuiltInSubgroupSize: // FIXME: Should work in any stage
 				return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " ");
@@ -8854,6 +9143,8 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in
 				break;
 			}
 		}
+		if (msl_options.multi_patch_workgroup)
+			return "";
 		uint32_t locn = get_ordered_member_location(type.self, index);
 		if (locn != k_unknown_location)
 			return string(" [[attribute(") + convert_to_string(locn) + ")]]";
@@ -9115,7 +9406,9 @@ string CompilerMSL::func_type_decl(SPIRType &type)
 	switch (execution.model)
 	{
 	case ExecutionModelVertex:
-		entry_type = "vertex";
+		if (msl_options.vertex_for_tessellation && !msl_options.supports_msl_version(1, 2))
+			SPIRV_CROSS_THROW("Tessellation requires Metal 1.2.");
+		entry_type = msl_options.vertex_for_tessellation ? "kernel" : "vertex";
 		break;
 	case ExecutionModelTessellationEvaluation:
 		if (!msl_options.supports_msl_version(1, 2))
@@ -9219,7 +9512,7 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo
 	case StorageClassInput:
 		if (get_execution_model() == ExecutionModelTessellationControl && var &&
 		    var->basevariable == stage_in_ptr_var_id)
-			addr_space = "threadgroup";
+			addr_space = msl_options.multi_patch_workgroup ? "constant" : "threadgroup";
 		break;
 
 	case StorageClassOutput:
@@ -9262,6 +9555,9 @@ string CompilerMSL::entry_point_arg_stage_in()
 {
 	string decl;
 
+	if (get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup)
+		return decl;
+
 	// Stage-in structure
 	uint32_t stage_in_id;
 	if (get_execution_model() == ExecutionModelTessellationEvaluation)
@@ -9287,6 +9583,14 @@ bool CompilerMSL::is_direct_input_builtin(BuiltIn bi_type)
 {
 	switch (bi_type)
 	{
+	// Vertex function in
+	case BuiltInVertexId:
+	case BuiltInVertexIndex:
+	case BuiltInBaseVertex:
+	case BuiltInInstanceId:
+	case BuiltInInstanceIndex:
+	case BuiltInBaseInstance:
+		return get_execution_model() != ExecutionModelVertex || !msl_options.vertex_for_tessellation;
 	// Tess. control function in
 	case BuiltInPosition:
 	case BuiltInPointSize:
@@ -9294,6 +9598,9 @@ bool CompilerMSL::is_direct_input_builtin(BuiltIn bi_type)
 	case BuiltInCullDistance:
 	case BuiltInPatchVertices:
 		return false;
+	case BuiltInInvocationId:
+	case BuiltInPrimitiveId:
+		return get_execution_model() != ExecutionModelTessellationControl || !msl_options.multi_patch_workgroup;
 	// Tess. evaluation function in
 	case BuiltInTessLevelInner:
 	case BuiltInTessLevelOuter:
@@ -9370,7 +9677,7 @@ void CompilerMSL::entry_point_args_builtin(string &ep_args)
 		    has_extended_decoration(var_id, SPIRVCrossDecorationBuiltInDispatchBase))
 		{
 			// This is a special implicit builtin, not corresponding to any SPIR-V builtin,
-			// which holds the base that was passed to vkCmdDispatchBase(). If it's present,
+			// which holds the base that was passed to vkCmdDispatchBase() or vkCmdDrawIndexed(). If it's present,
 			// assume we emitted it for a good reason.
 			assert(msl_options.supports_msl_version(1, 2));
 			if (!ep_args.empty())
@@ -9378,6 +9685,19 @@ void CompilerMSL::entry_point_args_builtin(string &ep_args)
 
 			ep_args += type_to_glsl(get_variable_data_type(var)) + " " + to_expression(var_id) + " [[grid_origin]]";
 		}
+
+		if (var.storage == StorageClassInput &&
+		    has_extended_decoration(var_id, SPIRVCrossDecorationBuiltInStageInputSize))
+		{
+			// This is another special implicit builtin, not corresponding to any SPIR-V builtin,
+			// which holds the number of vertices and instances to draw. If it's present,
+			// assume we emitted it for a good reason.
+			assert(msl_options.supports_msl_version(1, 2));
+			if (!ep_args.empty())
+				ep_args += ", ";
+
+			ep_args += type_to_glsl(get_variable_data_type(var)) + " " + to_expression(var_id) + " [[grid_size]]";
+		}
 	});
 
 	// Correct the types of all encountered active builtins. We couldn't do this before
@@ -9412,7 +9732,8 @@ void CompilerMSL::entry_point_args_builtin(string &ep_args)
 			ep_args +=
 			    join("constant uint* spvIndirectParams [[buffer(", msl_options.indirect_params_buffer_index, ")]]");
 		}
-		else if (stage_out_var_id)
+		else if (stage_out_var_id &&
+		         !(get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation))
 		{
 			if (!ep_args.empty())
 				ep_args += ", ";
@@ -9420,6 +9741,28 @@ void CompilerMSL::entry_point_args_builtin(string &ep_args)
 			    join("device uint* spvIndirectParams [[buffer(", msl_options.indirect_params_buffer_index, ")]]");
 		}
 
+		if (get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation &&
+		    (active_input_builtins.get(BuiltInVertexIndex) || active_input_builtins.get(BuiltInVertexId)) &&
+		    msl_options.vertex_index_type != Options::IndexType::None)
+		{
+			// Add the index buffer so we can set gl_VertexIndex correctly.
+			if (!ep_args.empty())
+				ep_args += ", ";
+			switch (msl_options.vertex_index_type)
+			{
+			case Options::IndexType::None:
+				break;
+			case Options::IndexType::UInt16:
+				ep_args += join("const device ushort* ", index_buffer_var_name, " [[buffer(",
+				                msl_options.shader_index_buffer_index, ")]]");
+				break;
+			case Options::IndexType::UInt32:
+				ep_args += join("const device uint* ", index_buffer_var_name, " [[buffer(",
+				                msl_options.shader_index_buffer_index, ")]]");
+				break;
+			}
+		}
+
 		// Tessellation control shaders get three additional parameters:
 		// a buffer to hold the per-patch data, a buffer to hold the per-patch
 		// tessellation levels, and a block of workgroup memory to hold the
@@ -9442,8 +9785,16 @@ void CompilerMSL::entry_point_args_builtin(string &ep_args)
 			{
 				if (!ep_args.empty())
 					ep_args += ", ";
-				ep_args += join("threadgroup ", type_to_glsl(get_stage_in_struct_type()), "* ", input_wg_var_name,
-				                " [[threadgroup(", convert_to_string(msl_options.shader_input_wg_index), ")]]");
+				if (msl_options.multi_patch_workgroup)
+				{
+					ep_args += join("device ", type_to_glsl(get_stage_in_struct_type()), "* ", input_buffer_var_name,
+					                " [[buffer(", convert_to_string(msl_options.shader_input_buffer_index), ")]]");
+				}
+				else
+				{
+					ep_args += join("threadgroup ", type_to_glsl(get_stage_in_struct_type()), "* ", input_wg_var_name,
+					                " [[threadgroup(", convert_to_string(msl_options.shader_input_wg_index), ")]]");
+				}
 			}
 		}
 	}
@@ -9737,6 +10088,21 @@ string CompilerMSL::entry_point_args_classic(bool append_comma)
 
 void CompilerMSL::fix_up_shader_inputs_outputs()
 {
+	auto &entry_func = this->get<SPIRFunction>(ir.default_entry_point);
+
+	// Emit a guard to ensure we don't execute beyond the last vertex.
+	// Vertex shaders shouldn't have the problems with barriers in non-uniform control flow that
+	// tessellation control shaders do, so early returns should be OK. We may need to revisit this
+	// if it ever becomes possible to use barriers from a vertex shader.
+	if (get_execution_model() == ExecutionModelVertex && msl_options.vertex_for_tessellation)
+	{
+		entry_func.fixup_hooks_in.push_back([this]() {
+			statement("if (any(", to_expression(builtin_invocation_id_id),
+			          " >= ", to_expression(builtin_stage_input_size_id), "))");
+			statement("    return;");
+		});
+	}
+
 	// Look for sampled images and buffer. Add hooks to set up the swizzle constants or array lengths.
 	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
 		auto &type = get_variable_data_type(var);
@@ -9747,7 +10113,6 @@ void CompilerMSL::fix_up_shader_inputs_outputs()
 		{
 			if (msl_options.swizzle_texture_samples && has_sampled_images && is_sampled_image_type(type))
 			{
-				auto &entry_func = this->get<SPIRFunction>(ir.default_entry_point);
 				entry_func.fixup_hooks_in.push_back([this, &type, &var, var_id]() {
 					bool is_array_type = !type.array.empty();
 
@@ -9774,7 +10139,6 @@ void CompilerMSL::fix_up_shader_inputs_outputs()
 		{
 			if (buffers_requiring_array_length.count(var.self))
 			{
-				auto &entry_func = this->get<SPIRFunction>(ir.default_entry_point);
 				entry_func.fixup_hooks_in.push_back([this, &type, &var, var_id]() {
 					bool is_array_type = !type.array.empty();
 
@@ -9799,13 +10163,12 @@ void CompilerMSL::fix_up_shader_inputs_outputs()
 	});
 
 	// Builtin variables
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
+	ir.for_each_typed_id<SPIRVariable>([this, &entry_func](uint32_t, SPIRVariable &var) {
 		uint32_t var_id = var.self;
 		BuiltIn bi_type = ir.meta[var_id].decoration.builtin_type;
 
 		if (var.storage == StorageClassInput && is_builtin_variable(var))
 		{
-			auto &entry_func = this->get<SPIRFunction>(ir.default_entry_point);
 			switch (bi_type)
 			{
 			case BuiltInSamplePosition:
@@ -9824,6 +10187,29 @@ void CompilerMSL::fix_up_shader_inputs_outputs()
 					statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = simd_is_helper_thread();");
 				});
 				break;
+			case BuiltInInvocationId:
+				// This is direct-mapped without multi-patch workgroups.
+				if (get_execution_model() != ExecutionModelTessellationControl || !msl_options.multi_patch_workgroup)
+					break;
+
+				entry_func.fixup_hooks_in.push_back([=]() {
+					statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ",
+					          to_expression(builtin_invocation_id_id), ".x % ", this->get_entry_point().output_vertices,
+					          ";");
+				});
+				break;
+			case BuiltInPrimitiveId:
+				// This is natively supported by fragment and tessellation evaluation shaders.
+				// In tessellation control shaders, this is direct-mapped without multi-patch workgroups.
+				if (get_execution_model() != ExecutionModelTessellationControl || !msl_options.multi_patch_workgroup)
+					break;
+
+				entry_func.fixup_hooks_in.push_back([=]() {
+					statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = min(",
+					          to_expression(builtin_invocation_id_id), ".x / ", this->get_entry_point().output_vertices,
+					          ", spvIndirectParams[1]);");
+				});
+				break;
 			case BuiltInPatchVertices:
 				if (get_execution_model() == ExecutionModelTessellationEvaluation)
 					entry_func.fixup_hooks_in.push_back([=]() {
@@ -10063,6 +10449,65 @@ void CompilerMSL::fix_up_shader_inputs_outputs()
 						          execution.workgroup_size.z, ");");
 				});
 				break;
+			case BuiltInVertexId:
+			case BuiltInVertexIndex:
+				// This is direct-mapped normally.
+				if (!msl_options.vertex_for_tessellation)
+					break;
+
+				entry_func.fixup_hooks_in.push_back([=]() {
+					builtin_declaration = true;
+					switch (msl_options.vertex_index_type)
+					{
+					case Options::IndexType::None:
+						statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ",
+						          to_expression(builtin_invocation_id_id), ".x + ",
+						          to_expression(builtin_dispatch_base_id), ".x;");
+						break;
+					case Options::IndexType::UInt16:
+					case Options::IndexType::UInt32:
+						statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ", index_buffer_var_name,
+						          "[", to_expression(builtin_invocation_id_id), ".x] + ",
+						          to_expression(builtin_dispatch_base_id), ".x;");
+						break;
+					}
+					builtin_declaration = false;
+				});
+				break;
+			case BuiltInBaseVertex:
+				// This is direct-mapped normally.
+				if (!msl_options.vertex_for_tessellation)
+					break;
+
+				entry_func.fixup_hooks_in.push_back([=]() {
+					statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ",
+					          to_expression(builtin_dispatch_base_id), ".x;");
+				});
+				break;
+			case BuiltInInstanceId:
+			case BuiltInInstanceIndex:
+				// This is direct-mapped normally.
+				if (!msl_options.vertex_for_tessellation)
+					break;
+
+				entry_func.fixup_hooks_in.push_back([=]() {
+					builtin_declaration = true;
+					statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ",
+					          to_expression(builtin_invocation_id_id), ".y + ", to_expression(builtin_dispatch_base_id),
+					          ".y;");
+					builtin_declaration = false;
+				});
+				break;
+			case BuiltInBaseInstance:
+				// This is direct-mapped normally.
+				if (!msl_options.vertex_for_tessellation)
+					break;
+
+				entry_func.fixup_hooks_in.push_back([=]() {
+					statement(builtin_type_decl(bi_type), " ", to_expression(var_id), " = ",
+					          to_expression(builtin_dispatch_base_id), ".y;");
+				});
+				break;
 			default:
 				break;
 			}
@@ -11653,6 +12098,11 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin)
 
 	// Tess. control function in
 	case BuiltInInvocationId:
+		if (msl_options.multi_patch_workgroup)
+		{
+			// Shouldn't be reached.
+			SPIRV_CROSS_THROW("InvocationId is computed manually with multi-patch workgroups in MSL.");
+		}
 		return "thread_index_in_threadgroup";
 	case BuiltInPatchVertices:
 		// Shouldn't be reached.
@@ -11661,6 +12111,11 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin)
 		switch (execution.model)
 		{
 		case ExecutionModelTessellationControl:
+			if (msl_options.multi_patch_workgroup)
+			{
+				// Shouldn't be reached.
+				SPIRV_CROSS_THROW("PrimitiveId is computed manually with multi-patch workgroups in MSL.");
+			}
 			return "threadgroup_position_in_grid";
 		case ExecutionModelTessellationEvaluation:
 			return "patch_id";
@@ -11941,6 +12396,18 @@ const SPIRType &CompilerMSL::get_physical_member_type(const SPIRType &type, uint
 		return get<SPIRType>(type.member_types[index]);
 }
 
+SPIRType CompilerMSL::get_presumed_input_type(const SPIRType &ib_type, uint32_t index) const
+{
+	SPIRType type = get_physical_member_type(ib_type, index);
+	uint32_t loc = get_member_decoration(ib_type.self, index, DecorationLocation);
+	if (inputs_by_location.count(loc))
+	{
+		if (inputs_by_location.at(loc).vecsize > type.vecsize)
+			type.vecsize = inputs_by_location.at(loc).vecsize;
+	}
+	return type;
+}
+
 uint32_t CompilerMSL::get_declared_type_array_stride_msl(const SPIRType &type, bool is_packed, bool row_major) const
 {
 	// Array stride in MSL is always size * array_size. sizeof(float3) == 16,
@@ -11976,6 +12443,12 @@ uint32_t CompilerMSL::get_declared_struct_member_array_stride_msl(const SPIRType
 	                                          has_member_decoration(type.self, index, DecorationRowMajor));
 }
 
+uint32_t CompilerMSL::get_declared_input_array_stride_msl(const SPIRType &type, uint32_t index) const
+{
+	return get_declared_type_array_stride_msl(get_presumed_input_type(type, index), false,
+	                                          has_member_decoration(type.self, index, DecorationRowMajor));
+}
+
 uint32_t CompilerMSL::get_declared_type_matrix_stride_msl(const SPIRType &type, bool packed, bool row_major) const
 {
 	// For packed matrices, we just use the size of the vector type.
@@ -11993,6 +12466,12 @@ uint32_t CompilerMSL::get_declared_struct_member_matrix_stride_msl(const SPIRTyp
 	                                           has_member_decoration(type.self, index, DecorationRowMajor));
 }
 
+uint32_t CompilerMSL::get_declared_input_matrix_stride_msl(const SPIRType &type, uint32_t index) const
+{
+	return get_declared_type_matrix_stride_msl(get_presumed_input_type(type, index), false,
+	                                           has_member_decoration(type.self, index, DecorationRowMajor));
+}
+
 uint32_t CompilerMSL::get_declared_struct_size_msl(const SPIRType &struct_type, bool ignore_alignment,
                                                    bool ignore_padding) const
 {
@@ -12078,6 +12557,12 @@ uint32_t CompilerMSL::get_declared_struct_member_size_msl(const SPIRType &type,
 	                                  has_member_decoration(type.self, index, DecorationRowMajor));
 }
 
+uint32_t CompilerMSL::get_declared_input_size_msl(const SPIRType &type, uint32_t index) const
+{
+	return get_declared_type_size_msl(get_presumed_input_type(type, index), false,
+	                                  has_member_decoration(type.self, index, DecorationRowMajor));
+}
+
 // Returns the byte alignment of a type.
 uint32_t CompilerMSL::get_declared_type_alignment_msl(const SPIRType &type, bool is_packed, bool row_major) const
 {
@@ -12134,6 +12619,12 @@ uint32_t CompilerMSL::get_declared_struct_member_alignment_msl(const SPIRType &t
 	                                       has_member_decoration(type.self, index, DecorationRowMajor));
 }
 
+uint32_t CompilerMSL::get_declared_input_alignment_msl(const SPIRType &type, uint32_t index) const
+{
+	return get_declared_type_alignment_msl(get_presumed_input_type(type, index), false,
+	                                       has_member_decoration(type.self, index, DecorationRowMajor));
+}
+
 bool CompilerMSL::skip_argument(uint32_t) const
 {
 	return false;
@@ -12708,11 +13199,18 @@ string CompilerMSL::to_initializer_expression(const SPIRVariable &var)
 	// FIXME: We cannot handle non-constant arrays being initialized.
 	// We will need to inject spvArrayCopy here somehow ...
 	auto &type = get<SPIRType>(var.basetype);
+	string expr;
 	if (ir.ids[var.initializer].get_type() == TypeConstant &&
 	    (!type.array.empty() || type.basetype == SPIRType::Struct))
-		return constant_expression(get<SPIRConstant>(var.initializer));
+		expr = constant_expression(get<SPIRConstant>(var.initializer));
 	else
-		return CompilerGLSL::to_initializer_expression(var);
+		expr = CompilerGLSL::to_initializer_expression(var);
+	// If the initializer has more vector components than the variable, add a swizzle.
+	// FIXME: This can't handle arrays or structs.
+	auto &init_type = expression_type(var.initializer);
+	if (type.array.empty() && type.basetype != SPIRType::Struct && init_type.vecsize > type.vecsize)
+		expr = enclose_expression(expr + vector_swizzle(type.vecsize, 0));
+	return expr;
 }
 
 string CompilerMSL::to_zero_initialized_expression(uint32_t)
diff --git a/spirv_msl.hpp b/spirv_msl.hpp
index dbe956f6..cdec83f5 100644
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -35,6 +35,8 @@ enum MSLShaderInputFormat
 	MSL_SHADER_INPUT_FORMAT_OTHER = 0,
 	MSL_SHADER_INPUT_FORMAT_UINT8 = 1,
 	MSL_SHADER_INPUT_FORMAT_UINT16 = 2,
+	MSL_SHADER_INPUT_FORMAT_ANY16 = 3,
+	MSL_SHADER_INPUT_FORMAT_ANY32 = 4,
 
 	// Deprecated aliases.
 	MSL_VERTEX_FORMAT_OTHER = MSL_SHADER_INPUT_FORMAT_OTHER,
@@ -271,6 +273,8 @@ public:
 		uint32_t buffer_size_buffer_index = 25;
 		uint32_t view_mask_buffer_index = 24;
 		uint32_t dynamic_offsets_buffer_index = 23;
+		uint32_t shader_input_buffer_index = 22;
+		uint32_t shader_index_buffer_index = 21;
 		uint32_t shader_input_wg_index = 0;
 		uint32_t device_index = 0;
 		uint32_t enable_frag_output_mask = 0xffffffff;
@@ -329,6 +333,31 @@ public:
 		// can be read in subsequent stages.
 		bool enable_clip_distance_user_varying = true;
 
+		// In a tessellation control shader, assume that more than one patch can be processed in a
+		// single workgroup. This requires changes to the way the InvocationId and PrimitiveId
+		// builtins are processed, but should result in more efficient usage of the GPU.
+		bool multi_patch_workgroup = false;
+
+		// If set, a vertex shader will be compiled as part of a tessellation pipeline.
+		// It will be translated as a compute kernel, so it can use the global invocation ID
+		// to index the output buffer.
+		bool vertex_for_tessellation = false;
+
+		enum class IndexType
+		{
+			None = 0,
+			UInt16 = 1,
+			UInt32 = 2
+		};
+
+		// The type of index in the index buffer, if present. For a compute shader, Metal
+		// requires specifying the indexing at pipeline creation, rather than at draw time
+		// as with graphics pipelines. This means we must create three different pipelines,
+		// for no indexing, 16-bit indices, and 32-bit indices. Each requires different
+		// handling for the gl_VertexIndex builtin. We may as well, then, create three
+		// different shaders for these three scenarios.
+		IndexType vertex_index_type = IndexType::None;
+
 		bool is_ios() const
 		{
 			return platform == iOS;
@@ -431,7 +460,7 @@ public:
 	// input is a shader input description used to fix up shader input variables.
 	// If shader inputs are provided, is_msl_shader_input_used() will return true after
 	// calling ::compile() if the location was used by the MSL code.
-	void add_msl_shader_input(const MSLShaderInput &attr);
+	void add_msl_shader_input(const MSLShaderInput &input);
 
 	// resource is a resource binding to indicate the MSL buffer,
 	// texture or sampler index to use for a particular SPIR-V description set
@@ -692,7 +721,7 @@ protected:
 
 	void fix_up_interface_member_indices(spv::StorageClass storage, uint32_t ib_type_id);
 
-	void mark_location_as_used_by_shader(uint32_t location, spv::StorageClass storage);
+	void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, spv::StorageClass storage);
 	uint32_t ensure_correct_builtin_type(uint32_t type_id, spv::BuiltIn builtin);
 	uint32_t ensure_correct_input_type(uint32_t type_id, uint32_t location, uint32_t num_components = 0);
 
@@ -739,7 +768,13 @@ protected:
 	uint32_t get_declared_struct_member_matrix_stride_msl(const SPIRType &struct_type, uint32_t index) const;
 	uint32_t get_declared_struct_member_alignment_msl(const SPIRType &struct_type, uint32_t index) const;
 
+	uint32_t get_declared_input_size_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_array_stride_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_matrix_stride_msl(const SPIRType &struct_type, uint32_t index) const;
+	uint32_t get_declared_input_alignment_msl(const SPIRType &struct_type, uint32_t index) const;
+
 	const SPIRType &get_physical_member_type(const SPIRType &struct_type, uint32_t index) const;
+	SPIRType get_presumed_input_type(const SPIRType &struct_type, uint32_t index) const;
 
 	uint32_t get_declared_struct_size_msl(const SPIRType &struct_type, bool ignore_alignment = false,
 	                                      bool ignore_padding = false) const;
@@ -785,6 +820,7 @@ protected:
 	uint32_t builtin_subgroup_invocation_id_id = 0;
 	uint32_t builtin_subgroup_size_id = 0;
 	uint32_t builtin_dispatch_base_id = 0;
+	uint32_t builtin_stage_input_size_id = 0;
 	uint32_t swizzle_buffer_id = 0;
 	uint32_t buffer_size_buffer_id = 0;
 	uint32_t view_mask_buffer_id = 0;
@@ -811,7 +847,8 @@ protected:
 
 	Options msl_options;
 	std::set<SPVFuncImpl> spv_function_implementations;
-	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_location;
+	// Must be ordered to ensure declarations are in a specific order.
+	std::map<uint32_t, MSLShaderInput> inputs_by_location;
 	std::unordered_map<uint32_t, MSLShaderInput> inputs_by_builtin;
 	std::unordered_set<uint32_t> inputs_in_use;
 	std::unordered_map<uint32_t, uint32_t> fragment_output_components;
@@ -866,9 +903,11 @@ protected:
 	std::string buffer_size_name_suffix = "BufferSize";
 	std::string plane_name_suffix = "Plane";
 	std::string input_wg_var_name = "gl_in";
+	std::string input_buffer_var_name = "spvIn";
 	std::string output_buffer_var_name = "spvOut";
 	std::string patch_output_buffer_var_name = "spvPatchOut";
 	std::string tess_factor_buffer_var_name = "spvTessLevel";
+	std::string index_buffer_var_name = "spvIndices";
 	spv::Op previous_instruction_opcode = spv::OpNop;
 
 	// Must be ordered since declaration is in a specific order.
diff --git a/test_shaders.py b/test_shaders.py
index 1f2e2504..14f3a574 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -280,6 +280,19 @@ def cross_compile_msl(shader, spirv, opt, iterations, paths):
         msl_args.append('6')
         msl_args.append('other')
         msl_args.append('4')
+    if '.multi-patch.' in shader:
+        msl_args.append('--msl-multi-patch-workgroup')
+        # Arbitrary for testing purposes.
+        msl_args.append('--msl-shader-input')
+        msl_args.append('0')
+        msl_args.append('any32')
+        msl_args.append('3')
+        msl_args.append('--msl-shader-input')
+        msl_args.append('1')
+        msl_args.append('any16')
+        msl_args.append('2')
+    if '.for-tess.' in shader:
+        msl_args.append('--msl-vertex-for-tessellation')
 
     subprocess.check_call(msl_args)