MSL: Rewrite how IO blocks are emitted in multi-patch mode.

Firstly, never flatten inputs or outputs in multi-patch mode. The main scenario where we do need to care is Block IO. In this case, we should only flatten the top-level member, and after that we use access chains as normal. Using structs in Input storage class is now possible as well. We don't need to consider per-location fixups at all here. In Vulkan, IO structs must match exactly. Only plain vectors can have smaller vector sizes as a special case.
2021-04-08 11:47:35 +02:00 · 2021-04-08 11:47:35 +02:00 · 46c48ee6b5
commit 46c48ee6b5
parent 425e968720
16 changed files with 463 additions and 189 deletions
--- a/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/reference/opt/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -16,8 +16,7 @@ struct main0_out

 struct main0_in
 {
-    float3 Boo_a;
-    uint3 Boo_b;
+    Boo vInput;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
@ -26,8 +25,7 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
-    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
-    gl_out[gl_InvocationID].vVertex = _26;
+    gl_out[gl_InvocationID].vVertex = gl_in[gl_InvocationID].vInput;
    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.for-tess.vert
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.for-tess.vert
@ -9,8 +9,6 @@ struct main0_out
    float4 gl_Position;
    float gl_PointSize;
    float gl_ClipDistance[2];
-    float gl_ClipDistance_0;
-    float gl_ClipDistance_1;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
@ -25,7 +23,5 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3
    out.gl_PointSize = 4.0;
    out.gl_ClipDistance[0] = 1.0;
    out.gl_ClipDistance[1] = 0.5;
-    out.gl_ClipDistance_0 = out.gl_ClipDistance[0];
-    out.gl_ClipDistance_1 = out.gl_ClipDistance[1];
 }

--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.for-tess.vert
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.for-tess.vert
@ -9,8 +9,6 @@ struct main0_out
    float4 gl_Position;
    float gl_PointSize;
    float gl_ClipDistance[2];
-    float gl_ClipDistance_0;
-    float gl_ClipDistance_1;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
@ -25,7 +23,5 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3
    out.gl_PointSize = 4.0;
    out.gl_ClipDistance[0] = 1.0;
    out.gl_ClipDistance[1] = 0.5;
-    out.gl_ClipDistance_0 = out.gl_ClipDistance[0];
-    out.gl_ClipDistance_1 = out.gl_ClipDistance[1];
 }

--- a/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.for-tess.vert
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.for-tess.vert
@ -9,8 +9,6 @@ struct main0_out
    float4 v1;
    float4 gl_Position;
    float gl_ClipDistance[2];
-    float gl_ClipDistance_0;
-    float gl_ClipDistance_1;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 spvStageInputSize [[grid_size]], device main0_out* spvOut [[buffer(28)]])
@ -25,7 +23,5 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3
    gl_PointSize = 4.0;
    out.gl_ClipDistance[0] = 1.0;
    out.gl_ClipDistance[1] = 0.5;
-    out.gl_ClipDistance_0 = out.gl_ClipDistance[0];
-    out.gl_ClipDistance_1 = out.gl_ClipDistance[1];
 }

--- a/reference/opt/shaders-msl/tesc/complex-patch-out-types.tesc
+++ b/reference/opt/shaders-msl/tesc/complex-patch-out-types.tesc
@ -0,0 +1,107 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Meep
+{
+    float a;
+    float b;
+};
+
+struct Block
+{
+    spvUnsafeArray<float, 2> a;
+    float b;
+    float2x2 m;
+    Meep meep;
+    spvUnsafeArray<Meep, 2> meeps;
+};
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    spvUnsafeArray<float, 2> a;
+    float b;
+    float2x2 m;
+    Meep meep;
+    spvUnsafeArray<Meep, 2> meeps;
+    spvUnsafeArray<float, 2> Block_a;
+    float Block_b;
+    float2x2 Block_m;
+    Meep Block_meep;
+    spvUnsafeArray<Meep, 2> Block_meeps;
+};
+
+kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
+    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
+    gl_out[gl_InvocationID].gl_Position = float4(1.0);
+    patchOut.a[0] = 1.0;
+    patchOut.a[1] = 2.0;
+    patchOut.b = 3.0;
+    patchOut.m = float2x2(float2(2.0, 0.0), float2(0.0, 2.0));
+    patchOut.meep.a = 4.0;
+    patchOut.meep.b = 5.0;
+    patchOut.meeps[0].a = 6.0;
+    patchOut.meeps[0].b = 7.0;
+    patchOut.meeps[1].a = 8.0;
+    patchOut.meeps[1].b = 9.0;
+    patchOut.Block_a[0] = 1.0;
+    patchOut.Block_a[1] = 2.0;
+    patchOut.Block_b = 3.0;
+    patchOut.Block_m = float2x2(float2(4.0, 0.0), float2(0.0, 4.0));
+    patchOut.Block_meep.a = 4.0;
+    patchOut.Block_meep.b = 5.0;
+    patchOut.Block_meeps[0].a = 6.0;
+    patchOut.Block_meeps[0].b = 7.0;
+    patchOut.Block_meeps[1].a = 8.0;
+    patchOut.Block_meeps[1].b = 9.0;
+}
+
--- a/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/reference/opt/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -58,9 +58,7 @@ struct main0_out

 struct main0_in
 {
-    float4x4 VertexData_a;
-    spvUnsafeArray<float4, 2> VertexData_b;
-    float4 VertexData_c;
+    VertexData vInputs;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
@ -69,10 +67,10 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
-    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
    spvUnsafeArray<VertexData, 32> tmp;
    tmp = _19;
    int _27 = gl_InvocationID ^ 1;
-    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + gl_in[_27].VertexData_c;
+    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + gl_in[_27].vInputs.c;
 }

--- a/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/tesc/tess-fixed-input-array-builtin-array.invalid.multi-patch.asm.tesc
@ -64,7 +64,7 @@ struct HSConstantOut

 struct VertexOutput_1
 {
-    float3 uv;
+    float2 uv;
 };

 struct HSOut_1
@ -80,8 +80,8 @@ struct main0_out

 struct main0_in
 {
-    float3 VertexOutput_uv;
-    ushort2 m_172;
+    VertexOutput_1 p;
+    ushort2 m_171;
    float4 gl_Position;
 };

@ -113,11 +113,11 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 3, spvIndirectParams[1]);
    spvUnsafeArray<VertexOutput, 3> p;
    p[0].pos = gl_in[0].gl_Position;
-    p[0].uv = gl_in[0].VertexOutput_uv.xy;
+    p[0].uv = gl_in[0].p.uv;
    p[1].pos = gl_in[1].gl_Position;
-    p[1].uv = gl_in[1].VertexOutput_uv.xy;
+    p[1].uv = gl_in[1].p.uv;
    p[2].pos = gl_in[2].gl_Position;
-    p[2].uv = gl_in[2].VertexOutput_uv.xy;
+    p[2].uv = gl_in[2].p.uv;
    uint i = gl_InvocationID;
    spvUnsafeArray<VertexOutput, 3> param;
    param = p;
--- a/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
+++ b/reference/shaders-msl/desktop-only/tesc/struct-copy.desktop.sso.multi-patch.tesc
@ -16,8 +16,7 @@ struct main0_out

 struct main0_in
 {
-    float3 Boo_a;
-    uint3 Boo_b;
+    Boo vInput;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
@ -26,8 +25,7 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
-    Boo _26 = Boo{ gl_in[gl_InvocationID].Boo_a, gl_in[gl_InvocationID].Boo_b };
-    gl_out[gl_InvocationID].vVertex = _26;
+    gl_out[gl_InvocationID].vVertex = gl_in[gl_InvocationID].vInput;
    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(1.0);
    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(2.0);
    spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[2] = half(3.0);
--- a/reference/shaders-msl/masking/write-outputs.mask-location-0.for-tess.vert
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-0.for-tess.vert
@ -11,8 +11,6 @@ struct main0_out
    float4 gl_Position;
    float gl_PointSize;
    float gl_ClipDistance[2];
-    float gl_ClipDistance_0;
-    float gl_ClipDistance_1;
 };

 static inline __attribute__((always_inline))
@ -33,7 +31,5 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3
    if (any(gl_GlobalInvocationID >= spvStageInputSize))
        return;
    write_in_func(v0, out.v1, out.gl_Position, out.gl_PointSize, out.gl_ClipDistance);
-    out.gl_ClipDistance_0 = out.gl_ClipDistance[0];
-    out.gl_ClipDistance_1 = out.gl_ClipDistance[1];
 }

--- a/reference/shaders-msl/masking/write-outputs.mask-location-1.for-tess.vert
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-1.for-tess.vert
@ -11,8 +11,6 @@ struct main0_out
    float4 gl_Position;
    float gl_PointSize;
    float gl_ClipDistance[2];
-    float gl_ClipDistance_0;
-    float gl_ClipDistance_1;
 };

 static inline __attribute__((always_inline))
@ -33,7 +31,5 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3
    if (any(gl_GlobalInvocationID >= spvStageInputSize))
        return;
    write_in_func(out.v0, v1, out.gl_Position, out.gl_PointSize, out.gl_ClipDistance);
-    out.gl_ClipDistance_0 = out.gl_ClipDistance[0];
-    out.gl_ClipDistance_1 = out.gl_ClipDistance[1];
 }

--- a/reference/shaders-msl/masking/write-outputs.mask-point-size.for-tess.vert
+++ b/reference/shaders-msl/masking/write-outputs.mask-point-size.for-tess.vert
@ -11,8 +11,6 @@ struct main0_out
    float4 v1;
    float4 gl_Position;
    float gl_ClipDistance[2];
-    float gl_ClipDistance_0;
-    float gl_ClipDistance_1;
 };

 static inline __attribute__((always_inline))
@ -33,7 +31,5 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3
    if (any(gl_GlobalInvocationID >= spvStageInputSize))
        return;
    write_in_func(out.v0, out.v1, out.gl_Position, gl_PointSize, out.gl_ClipDistance);
-    out.gl_ClipDistance_0 = out.gl_ClipDistance[0];
-    out.gl_ClipDistance_1 = out.gl_ClipDistance[1];
 }

--- a/reference/shaders-msl/tesc/complex-patch-out-types.tesc
+++ b/reference/shaders-msl/tesc/complex-patch-out-types.tesc
@ -0,0 +1,113 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Meep
+{
+    float a;
+    float b;
+};
+
+struct Block
+{
+    spvUnsafeArray<float, 2> a;
+    float b;
+    float2x2 m;
+    Meep meep;
+    spvUnsafeArray<Meep, 2> meeps;
+};
+
+struct main0_out
+{
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    spvUnsafeArray<float, 2> a;
+    float b;
+    float2x2 m;
+    Meep meep;
+    spvUnsafeArray<Meep, 2> meeps;
+    spvUnsafeArray<float, 2> Block_a;
+    float Block_b;
+    float2x2 Block_m;
+    Meep Block_meep;
+    spvUnsafeArray<Meep, 2> Block_meeps;
+};
+
+static inline __attribute__((always_inline))
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float, 2>& a, device float& b, device float2x2& m, device Meep& meep, device spvUnsafeArray<Meep, 2>& meeps, device main0_patchOut& patchOut)
+{
+    gl_out[gl_InvocationID].gl_Position = float4(1.0);
+    a[0] = 1.0;
+    a[1] = 2.0;
+    b = 3.0;
+    m = float2x2(float2(2.0, 0.0), float2(0.0, 2.0));
+    meep.a = 4.0;
+    meep.b = 5.0;
+    meeps[0].a = 6.0;
+    meeps[0].b = 7.0;
+    meeps[1].a = 8.0;
+    meeps[1].b = 9.0;
+    patchOut.Block_a[0] = 1.0;
+    patchOut.Block_a[1] = 2.0;
+    patchOut.Block_b = 3.0;
+    patchOut.Block_m = float2x2(float2(4.0, 0.0), float2(0.0, 4.0));
+    patchOut.Block_meep.a = 4.0;
+    patchOut.Block_meep.b = 5.0;
+    patchOut.Block_meeps[0].a = 6.0;
+    patchOut.Block_meeps[0].b = 7.0;
+    patchOut.Block_meeps[1].a = 8.0;
+    patchOut.Block_meeps[1].b = 9.0;
+}
+
+kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
+    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
+    write_in_func(gl_out, gl_InvocationID, patchOut.a, patchOut.b, patchOut.m, patchOut.meep, patchOut.meeps, patchOut);
+}
+
--- a/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
+++ b/reference/shaders-msl/tesc/load-control-point-array-of-struct.multi-patch.tesc
@ -58,9 +58,7 @@ struct main0_out

 struct main0_in
 {
-    float4x4 VertexData_a;
-    spvUnsafeArray<float4, 2> VertexData_b;
-    float4 VertexData_c;
+    VertexData vInputs;
 };

 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
@ -69,12 +67,10 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
-    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ VertexData{ gl_in[0].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[0].VertexData_b[0], gl_in[0].VertexData_b[1] }), gl_in[0].VertexData_c }, VertexData{ gl_in[1].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[1].VertexData_b[0], gl_in[1].VertexData_b[1] }), gl_in[1].VertexData_c }, VertexData{ gl_in[2].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[2].VertexData_b[0], gl_in[2].VertexData_b[1] }), gl_in[2].VertexData_c }, VertexData{ gl_in[3].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[3].VertexData_b[0], gl_in[3].VertexData_b[1] }), gl_in[3].VertexData_c }, VertexData{ gl_in[4].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[4].VertexData_b[0], gl_in[4].VertexData_b[1] }), gl_in[4].VertexData_c }, VertexData{ gl_in[5].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[5].VertexData_b[0], gl_in[5].VertexData_b[1] }), gl_in[5].VertexData_c }, VertexData{ gl_in[6].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[6].VertexData_b[0], gl_in[6].VertexData_b[1] }), gl_in[6].VertexData_c }, VertexData{ gl_in[7].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[7].VertexData_b[0], gl_in[7].VertexData_b[1] }), gl_in[7].VertexData_c }, VertexData{ gl_in[8].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[8].VertexData_b[0], gl_in[8].VertexData_b[1] }), gl_in[8].VertexData_c }, VertexData{ gl_in[9].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[9].VertexData_b[0], gl_in[9].VertexData_b[1] }), gl_in[9].VertexData_c }, VertexData{ gl_in[10].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[10].VertexData_b[0], gl_in[10].VertexData_b[1] }), gl_in[10].VertexData_c }, VertexData{ gl_in[11].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[11].VertexData_b[0], gl_in[11].VertexData_b[1] }), gl_in[11].VertexData_c }, VertexData{ gl_in[12].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[12].VertexData_b[0], gl_in[12].VertexData_b[1] }), gl_in[12].VertexData_c }, VertexData{ gl_in[13].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[13].VertexData_b[0], gl_in[13].VertexData_b[1] }), gl_in[13].VertexData_c }, VertexData{ gl_in[14].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[14].VertexData_b[0], gl_in[14].VertexData_b[1] }), gl_in[14].VertexData_c }, VertexData{ gl_in[15].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[15].VertexData_b[0], gl_in[15].VertexData_b[1] }), gl_in[15].VertexData_c }, VertexData{ gl_in[16].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[16].VertexData_b[0], gl_in[16].VertexData_b[1] }), gl_in[16].VertexData_c }, VertexData{ gl_in[17].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[17].VertexData_b[0], gl_in[17].VertexData_b[1] }), gl_in[17].VertexData_c }, VertexData{ gl_in[18].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[18].VertexData_b[0], gl_in[18].VertexData_b[1] }), gl_in[18].VertexData_c }, VertexData{ gl_in[19].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[19].VertexData_b[0], gl_in[19].VertexData_b[1] }), gl_in[19].VertexData_c }, VertexData{ gl_in[20].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[20].VertexData_b[0], gl_in[20].VertexData_b[1] }), gl_in[20].VertexData_c }, VertexData{ gl_in[21].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[21].VertexData_b[0], gl_in[21].VertexData_b[1] }), gl_in[21].VertexData_c }, VertexData{ gl_in[22].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[22].VertexData_b[0], gl_in[22].VertexData_b[1] }), gl_in[22].VertexData_c }, VertexData{ gl_in[23].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[23].VertexData_b[0], gl_in[23].VertexData_b[1] }), gl_in[23].VertexData_c }, VertexData{ gl_in[24].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[24].VertexData_b[0], gl_in[24].VertexData_b[1] }), gl_in[24].VertexData_c }, VertexData{ gl_in[25].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[25].VertexData_b[0], gl_in[25].VertexData_b[1] }), gl_in[25].VertexData_c }, VertexData{ gl_in[26].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[26].VertexData_b[0], gl_in[26].VertexData_b[1] }), gl_in[26].VertexData_c }, VertexData{ gl_in[27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[27].VertexData_b[0], gl_in[27].VertexData_b[1] }), gl_in[27].VertexData_c }, VertexData{ gl_in[28].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[28].VertexData_b[0], gl_in[28].VertexData_b[1] }), gl_in[28].VertexData_c }, VertexData{ gl_in[29].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[29].VertexData_b[0], gl_in[29].VertexData_b[1] }), gl_in[29].VertexData_c }, VertexData{ gl_in[30].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[30].VertexData_b[0], gl_in[30].VertexData_b[1] }), gl_in[30].VertexData_c }, VertexData{ gl_in[31].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[31].VertexData_b[0], gl_in[31].VertexData_b[1] }), gl_in[31].VertexData_c } });
+    spvUnsafeArray<VertexData, 32> _19 = spvUnsafeArray<VertexData, 32>({ gl_in[0].vInputs, gl_in[1].vInputs, gl_in[2].vInputs, gl_in[3].vInputs, gl_in[4].vInputs, gl_in[5].vInputs, gl_in[6].vInputs, gl_in[7].vInputs, gl_in[8].vInputs, gl_in[9].vInputs, gl_in[10].vInputs, gl_in[11].vInputs, gl_in[12].vInputs, gl_in[13].vInputs, gl_in[14].vInputs, gl_in[15].vInputs, gl_in[16].vInputs, gl_in[17].vInputs, gl_in[18].vInputs, gl_in[19].vInputs, gl_in[20].vInputs, gl_in[21].vInputs, gl_in[22].vInputs, gl_in[23].vInputs, gl_in[24].vInputs, gl_in[25].vInputs, gl_in[26].vInputs, gl_in[27].vInputs, gl_in[28].vInputs, gl_in[29].vInputs, gl_in[30].vInputs, gl_in[31].vInputs });
    spvUnsafeArray<VertexData, 32> tmp;
    tmp = _19;
-    int _27 = gl_InvocationID ^ 1;
-    VertexData _30 = VertexData{ gl_in[_27].VertexData_a, spvUnsafeArray<float4, 2>({ gl_in[_27].VertexData_b[0], gl_in[_27].VertexData_b[1] }), gl_in[_27].VertexData_c };
-    VertexData tmp_single = _30;
+    VertexData tmp_single = gl_in[gl_InvocationID ^ 1].vInputs;
    gl_out[gl_InvocationID].vOutputs = ((tmp[gl_InvocationID].a[1] + tmp[gl_InvocationID].b[1]) + tmp[gl_InvocationID].c) + tmp_single.c;
 }

--- a/shaders-msl/tesc/complex-patch-out-types.tesc
+++ b/shaders-msl/tesc/complex-patch-out-types.tesc
@ -0,0 +1,55 @@
+#version 450
+layout(vertices = 4) out;
+
+struct Meep
+{
+	float a;
+	float b;
+};
+
+layout(location = 0) patch out float a[2];
+layout(location = 2) patch out float b;
+layout(location = 3) patch out mat2 m;
+layout(location = 5) patch out Meep meep;
+layout(location = 7) patch out Meep meeps[2];
+
+layout(location = 11) patch out Block
+{
+	float a[2];
+	float b;
+	mat2 m;
+	Meep meep;
+	Meep meeps[2];
+} B;
+
+void write_in_func()
+{
+	gl_out[gl_InvocationID].gl_Position = vec4(1.0);
+
+	a[0] = 1.0;
+	a[1] = 2.0;
+	b = 3.0;
+	m = mat2(2.0);
+	meep.a = 4.0;
+	meep.b = 5.0;
+	meeps[0].a = 6.0;
+	meeps[0].b = 7.0;
+	meeps[1].a = 8.0;
+	meeps[1].b = 9.0;
+
+	B.a[0] = 1.0;
+	B.a[1] = 2.0;
+	B.b = 3.0;
+	B.m = mat2(4.0);
+	B.meep.a = 4.0;
+	B.meep.b = 5.0;
+	B.meeps[0].a = 6.0;
+	B.meeps[0].b = 7.0;
+	B.meeps[1].a = 8.0;
+	B.meeps[1].b = 9.0;
+}
+
+void main()
+{
+	write_in_func();
+}
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@ -1731,8 +1731,11 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 	// Add the global variables as arguments to the function
 	if (func_id != ir.default_entry_point)
 	{
-		bool added_in = false;
-		bool added_out = false;
+		bool control_point_added_in = false;
+		bool control_point_added_out = false;
+		bool patch_added_in = false;
+		bool patch_added_out = false;
+
 		for (uint32_t arg_id : added_arg_ids)
 		{
 			auto &var = get<SPIRVariable>(arg_id);
@ -1741,16 +1744,19 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 			BuiltIn bi_type = BuiltIn(get_decoration(arg_id, DecorationBuiltIn));

 			bool is_patch = has_decoration(arg_id, DecorationPatch) || is_patch_block(*p_type);
+			bool is_block = has_decoration(p_type->self, DecorationBlock);
 			bool is_control_point_storage =
 					!is_patch &&
 					((is_tessellation_shader() && var.storage == StorageClassInput) ||
 					 (get_execution_model() == ExecutionModelTessellationControl && var.storage == StorageClassOutput));
+			bool is_patch_block_storage = is_patch && is_block && var.storage == StorageClassOutput;
 			bool is_builtin = is_builtin_variable(var);
 			bool variable_is_stage_io =
 					!is_builtin || bi_type == BuiltInPosition || bi_type == BuiltInPointSize ||
 					bi_type == BuiltInClipDistance || bi_type == BuiltInCullDistance ||
 					p_type->basetype == SPIRType::Struct;
-			bool is_redirected_to_global_stage_io = is_control_point_storage && variable_is_stage_io;
+			bool is_redirected_to_global_stage_io = (is_control_point_storage || is_patch_block_storage) &&
+			                                        variable_is_stage_io;

 			// If output is masked it is not considered part of the global stage IO interface.
 			if (is_redirected_to_global_stage_io && var.storage == StorageClassOutput)
@ -1762,7 +1768,11 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 				// Similarly, tessellation evaluation shaders see per-vertex inputs as arrays.
 				// We collected them into a structure; we must pass the array of this
 				// structure to the function.
-				std::string name = var.storage == StorageClassInput ? "gl_in" : "gl_out";
+				std::string name;
+				if (is_patch)
+					name = var.storage == StorageClassInput ? patch_stage_in_var_name : patch_stage_out_var_name;
+				else
+					name = var.storage == StorageClassInput ? "gl_in" : "gl_out";

 				if (var.storage == StorageClassOutput &&
 				    has_decoration(p_type->self, DecorationBlock) &&
@ -1797,16 +1807,18 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 				// structure to the function.
 				if (var.storage == StorageClassInput)
 				{
+					auto &added_in = is_patch ? patch_added_in : control_point_added_in;
 					if (added_in)
 						continue;
-					arg_id = stage_in_ptr_var_id;
+					arg_id = is_patch ? patch_stage_in_var_id : stage_in_ptr_var_id;
 					added_in = true;
 				}
 				else if (var.storage == StorageClassOutput)
 				{
+					auto &added_out = is_patch ? patch_added_out : control_point_added_out;
 					if (added_out)
 						continue;
-					arg_id = stage_out_ptr_var_id;
+					arg_id = is_patch ? patch_stage_out_var_id : stage_out_ptr_var_id;
 					added_out = true;
 				}

@ -1915,14 +1927,23 @@ void CompilerMSL::mark_as_packable(SPIRType &type)

 uint32_t CompilerMSL::type_to_location_count(const SPIRType &type) const
 {
-	// In MSL, we cannot place structs in any context where we need locations.
-	assert(type.basetype != SPIRType::Struct);
+	uint32_t count;
+	if (type.basetype == SPIRType::Struct)
+	{
+		uint32_t mbr_count = uint32_t(type.member_types.size());
+		count = 0;
+		for (uint32_t i = 0; i < mbr_count; i++)
+			count += type_to_location_count(get<SPIRType>(type.member_types[i]));
+	}
+	else
+	{
+		count = type.columns > 1 ? type.columns : 1;
+	}

-	uint32_t dim = 1;
-	for (uint32_t i = 0; i < type.array.size(); i++)
-		dim *= to_array_size_literal(type, i);
+	uint32_t dim_count = uint32_t(type.array.size());
+	for (uint32_t i = 0; i < dim_count; i++)
+		count *= to_array_size_literal(type, i);

-	uint32_t count = dim * type.columns;
 	return count;
 }

@ -2911,6 +2932,16 @@ void CompilerMSL::add_tess_level_input_to_interface_block(const std::string &ib_
 	}
 }

+bool CompilerMSL::variable_storage_requires_stage_io(spv::StorageClass storage) const
+{
+	if (storage == StorageClassOutput)
+		return !capture_output_to_buffer;
+	else if (storage == StorageClassInput)
+		return !(get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup);
+	else
+		return false;
+}
+
 void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const string &ib_var_ref, SPIRType &ib_type,
                                                  SPIRVariable &var, InterfaceBlockMeta &meta)
 {
@ -2942,7 +2973,7 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st

 	if (var_type.basetype == SPIRType::Struct)
 	{
-		bool block_requires_flattening = is_block || !capture_output_to_buffer || storage == StorageClassInput;
+		bool block_requires_flattening = variable_storage_requires_stage_io(storage) || is_block;
 		bool needs_local_declaration = !is_builtin && block_requires_flattening && meta.allow_local_declaration;

 		if (needs_local_declaration)
@ -3009,10 +3040,7 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st
 					bool is_composite_type = is_matrix(mbr_type) || is_array(mbr_type);
 					bool attribute_load_store =
 					    storage == StorageClassInput && get_execution_model() != ExecutionModelFragment;
-					bool storage_is_stage_io =
-					    (storage == StorageClassInput && !(get_execution_model() == ExecutionModelTessellationControl &&
-					                                       msl_options.multi_patch_workgroup)) ||
-					    storage == StorageClassOutput;
+					bool storage_is_stage_io = variable_storage_requires_stage_io(storage);

 					// ClipDistance always needs to be declared as user attributes.
 					if (builtin == BuiltInClipDistance)
@ -3042,10 +3070,7 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st
 		if (!is_builtin || has_active_builtin(builtin, storage))
 		{
 			bool is_composite_type = is_matrix(var_type) || is_array(var_type);
-			bool storage_is_stage_io =
-			    (storage == StorageClassInput &&
-			     !(get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup)) ||
-			    (storage == StorageClassOutput && !capture_output_to_buffer);
+			bool storage_is_stage_io = variable_storage_requires_stage_io(storage);
 			bool attribute_load_store = storage == StorageClassInput && get_execution_model() != ExecutionModelFragment;

 			// ClipDistance always needs to be declared as user attributes.
@ -3088,8 +3113,11 @@ void CompilerMSL::fix_up_interface_member_indices(StorageClass storage, uint32_t

 		auto &type = get_variable_element_type(var);

+		bool flatten_composites = variable_storage_requires_stage_io(var.storage);
+		bool is_block = has_decoration(type.self, DecorationBlock);
+
 		uint32_t mbr_idx = uint32_t(-1);
-		if (type.basetype == SPIRType::Struct)
+		if (type.basetype == SPIRType::Struct && (flatten_composites || is_block))
 			mbr_idx = get_extended_member_decoration(ib_type_id, i, SPIRVCrossDecorationInterfaceMemberIndex);

 		if (mbr_idx != uint32_t(-1))
@ -3580,6 +3608,10 @@ uint32_t CompilerMSL::ensure_correct_input_type(uint32_t type_id, uint32_t locat
 {
 	auto &type = get<SPIRType>(type_id);

+	// Struct types must match exactly.
+	if (type.basetype == SPIRType::Struct)
+		return type_id;
+
 	auto p_va = inputs_by_location.find(location);
 	if (p_va == end(inputs_by_location))
 	{
@ -6525,16 +6557,22 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 	if (ptr_type.storage == StorageClassOutput && get_execution_model() == ExecutionModelTessellationEvaluation)
 		return false;

-	bool multi_patch_tess_ctl = get_execution_model() == ExecutionModelTessellationControl &&
-	                            msl_options.multi_patch_workgroup && ptr_type.storage == StorageClassInput;
-	bool flat_matrix = is_matrix(result_type) && ptr_type.storage == StorageClassInput && !multi_patch_tess_ctl;
-	bool flat_struct = result_type.basetype == SPIRType::Struct && ptr_type.storage == StorageClassInput;
-	bool flat_data_type = flat_matrix || is_array(result_type) || flat_struct;
-	if (!flat_data_type)
-		return false;
-
 	if (has_decoration(ptr, DecorationPatch))
 		return false;
+	bool ptr_is_io_variable = ir.ids[ptr].get_type() == TypeVariable;
+
+	bool flattened_io = variable_storage_requires_stage_io(ptr_type.storage);
+
+	bool flat_data_type = flattened_io &&
+	                      (is_matrix(result_type) || is_array(result_type) || result_type.basetype == SPIRType::Struct);
+
+	// Edge case, even with multi-patch workgroups, we still need to unroll load
+	// if we're loading control points directly.
+	if (ptr_is_io_variable && is_array(result_type))
+		flat_data_type = true;
+
+	if (!flat_data_type)
+		return false;

 	// Now, we must unflatten a composite type and take care of interleaving array access with gl_in/gl_out.
 	// Lots of painful code duplication since we *really* should not unroll these kinds of loads in entry point fixup
@ -6543,12 +6581,31 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id

 	uint32_t interface_index = get_extended_decoration(ptr, SPIRVCrossDecorationInterfaceMemberIndex);
 	auto *var = maybe_get_backing_variable(ptr);
-	bool ptr_is_io_variable = ir.ids[ptr].get_type() == TypeVariable;
 	auto &expr_type = get_pointee_type(ptr_type.self);

 	const auto &iface_type = expression_type(stage_in_ptr_var_id);

-	if (result_type.array.size() > 2)
+	if (!flattened_io)
+	{
+		// Simplest case for multi-patch workgroups, just unroll array as-is.
+		if (interface_index == uint32_t(-1))
+			return false;
+
+		expr += type_to_glsl(result_type) + "({ ";
+		uint32_t num_control_points = to_array_size_literal(result_type, uint32_t(result_type.array.size()) - 1);
+
+		for (uint32_t i = 0; i < num_control_points; i++)
+		{
+			const uint32_t indices[2] = { i, interface_index };
+			AccessChainMeta meta;
+			expr += access_chain_internal(stage_in_ptr_var_id, indices, 2,
+			                              ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+			if (i + 1 < num_control_points)
+				expr += ", ";
+		}
+		expr += " })";
+	}
+	else if (result_type.array.size() > 2)
 	{
 		SPIRV_CROSS_THROW("Cannot load tessellation IO variables with more than 2 dimensions.");
 	}
@ -6558,7 +6615,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 			SPIRV_CROSS_THROW("Loading an array-of-array must be loaded directly from an IO variable.");
 		if (interface_index == uint32_t(-1))
 			SPIRV_CROSS_THROW("Interface index is unknown. Cannot continue.");
-		if (result_type.basetype == SPIRType::Struct || flat_matrix)
+		if (result_type.basetype == SPIRType::Struct || is_matrix(result_type))
 			SPIRV_CROSS_THROW("Cannot load array-of-array of composite type in tessellation IO.");

 		expr += type_to_glsl(result_type) + "({ ";
@ -6572,44 +6629,19 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 			expr += type_to_glsl(sub_type) + "({ ";
 			interface_index = base_interface_index;
 			uint32_t array_size = to_array_size_literal(result_type, 0);
-			if (multi_patch_tess_ctl)
+			for (uint32_t j = 0; j < array_size; j++, interface_index++)
 			{
-				for (uint32_t j = 0; j < array_size; j++)
-				{
-					const uint32_t indices[3] = { i, interface_index, j };
+				const uint32_t indices[2] = { i, interface_index };

-					AccessChainMeta meta;
-					expr +=
-					    access_chain_internal(stage_in_ptr_var_id, indices, 3,
-					                          ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
-					// If the expression has more vector components than the result type, insert
-					// a swizzle. This shouldn't happen normally on valid SPIR-V, but it might
-					// happen if we replace the type of an input variable.
-					if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct &&
-					    expr_type.vecsize > sub_type.vecsize)
-						expr += vector_swizzle(sub_type.vecsize, 0);
+				AccessChainMeta meta;
+				expr += access_chain_internal(stage_in_ptr_var_id, indices, 2,
+				                              ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+				if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct &&
+					expr_type.vecsize > sub_type.vecsize)
+					expr += vector_swizzle(sub_type.vecsize, 0);

-					if (j + 1 < array_size)
-						expr += ", ";
-				}
-			}
-			else
-			{
-				for (uint32_t j = 0; j < array_size; j++, interface_index++)
-				{
-					const uint32_t indices[2] = { i, interface_index };
-
-					AccessChainMeta meta;
-					expr +=
-					    access_chain_internal(stage_in_ptr_var_id, indices, 2,
-					                          ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
-					if (!is_matrix(sub_type) && sub_type.basetype != SPIRType::Struct &&
-					    expr_type.vecsize > sub_type.vecsize)
-						expr += vector_swizzle(sub_type.vecsize, 0);
-
-					if (j + 1 < array_size)
-						expr += ", ";
-				}
+				if (j + 1 < array_size)
+					expr += ", ";
 			}
 			expr += " })";
 			if (i + 1 < num_control_points)
@ -6617,7 +6649,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 		}
 		expr += " })";
 	}
-	else if (flat_struct)
+	else if (result_type.basetype == SPIRType::Struct)
 	{
 		bool is_array_of_struct = is_array(result_type);
 		if (is_array_of_struct && !ptr_is_io_variable)
@ -6650,7 +6682,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id

 				const auto &mbr_type = get<SPIRType>(struct_type.member_types[j]);
 				const auto &expr_mbr_type = get<SPIRType>(expr_type.member_types[j]);
-				if (is_matrix(mbr_type) && ptr_type.storage == StorageClassInput && !multi_patch_tess_ctl)
+				if (is_matrix(mbr_type) && ptr_type.storage == StorageClassInput)
 				{
 					expr += type_to_glsl(mbr_type) + "(";
 					for (uint32_t k = 0; k < mbr_type.columns; k++, interface_index++)
@ -6660,8 +6692,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 							const uint32_t indices[2] = { i, interface_index };
 							AccessChainMeta meta;
 							expr += access_chain_internal(
-							    stage_in_ptr_var_id, indices, 2,
-							    ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+									stage_in_ptr_var_id, indices, 2,
+									ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
 						}
 						else
 							expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
@ -6677,48 +6709,23 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 				{
 					expr += type_to_glsl(mbr_type) + "({ ";
 					uint32_t array_size = to_array_size_literal(mbr_type, 0);
-					if (multi_patch_tess_ctl)
+					for (uint32_t k = 0; k < array_size; k++, interface_index++)
 					{
-						for (uint32_t k = 0; k < array_size; k++)
+						if (is_array_of_struct)
 						{
-							if (is_array_of_struct)
-							{
-								const uint32_t indices[3] = { i, interface_index, k };
-								AccessChainMeta meta;
-								expr += access_chain_internal(
-								    stage_in_ptr_var_id, indices, 3,
-								    ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
-							}
-							else
-								expr += join(to_expression(ptr), ".", to_member_name(iface_type, interface_index), "[",
-								             k, "]");
-							if (expr_mbr_type.vecsize > mbr_type.vecsize)
-								expr += vector_swizzle(mbr_type.vecsize, 0);
-
-							if (k + 1 < array_size)
-								expr += ", ";
+							const uint32_t indices[2] = { i, interface_index };
+							AccessChainMeta meta;
+							expr += access_chain_internal(
+									stage_in_ptr_var_id, indices, 2,
+									ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
 						}
-					}
-					else
-					{
-						for (uint32_t k = 0; k < array_size; k++, interface_index++)
-						{
-							if (is_array_of_struct)
-							{
-								const uint32_t indices[2] = { i, interface_index };
-								AccessChainMeta meta;
-								expr += access_chain_internal(
-								    stage_in_ptr_var_id, indices, 2,
-								    ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
-							}
-							else
-								expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
-							if (expr_mbr_type.vecsize > mbr_type.vecsize)
-								expr += vector_swizzle(mbr_type.vecsize, 0);
+						else
+							expr += to_expression(ptr) + "." + to_member_name(iface_type, interface_index);
+						if (expr_mbr_type.vecsize > mbr_type.vecsize)
+							expr += vector_swizzle(mbr_type.vecsize, 0);

-							if (k + 1 < array_size)
-								expr += ", ";
-						}
+						if (k + 1 < array_size)
+							expr += ", ";
 					}
 					expr += " })";
 				}
@ -6748,7 +6755,7 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 		if (is_array_of_struct)
 			expr += " })";
 	}
-	else if (flat_matrix)
+	else if (is_matrix(result_type))
 	{
 		bool is_array_of_matrix = is_array(result_type);
 		if (is_array_of_matrix && !ptr_is_io_variable)
@ -6774,9 +6781,8 @@ bool CompilerMSL::emit_tessellation_io_load(uint32_t result_type_id, uint32_t id
 					const uint32_t indices[2] = { i, interface_index };

 					AccessChainMeta meta;
-					expr +=
-					    access_chain_internal(stage_in_ptr_var_id, indices, 2,
-					                          ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
+					expr += access_chain_internal(stage_in_ptr_var_id, indices, 2,
+					                              ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_PTR_CHAIN_BIT, &meta);
 					if (expr_type.vecsize > result_type.vecsize)
 						expr += vector_swizzle(result_type.vecsize, 0);
 					if (j + 1 < result_type.columns)
@ -6867,7 +6873,8 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 	bool patch = false;
 	bool flat_data = false;
 	bool ptr_is_chain = false;
-	bool multi_patch = get_execution_model() == ExecutionModelTessellationControl && msl_options.multi_patch_workgroup;
+	bool flatten_composites = false;
+
 	bool is_block = false;

 	if (var)
@ -6875,13 +6882,15 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l

 	if (var)
 	{
+		flatten_composites = variable_storage_requires_stage_io(var->storage);
 		patch = has_decoration(ops[2], DecorationPatch) || is_patch_block(get_variable_data_type(*var));

 		// Should match strip_array in add_interface_block.
 		flat_data = var->storage == StorageClassInput ||
 		            (var->storage == StorageClassOutput && get_execution_model() == ExecutionModelTessellationControl);

-		if (patch && (!is_block || var->storage != StorageClassOutput))
+		// Patch inputs are treated as normal block IO variables, so they don't deal with this path at all.
+		if (patch && (!is_block || var->storage == StorageClassInput))
 			flat_data = false;

 		// We might have a chained access chain, where
@ -6943,7 +6952,7 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l

 		VariableID stage_var_id;
 		if (patch)
-			stage_var_id = patch_stage_out_var_id;
+			stage_var_id = var->storage == StorageClassInput ? patch_stage_in_var_id : patch_stage_out_var_id;
 		else
 			stage_var_id = var->storage == StorageClassInput ? stage_in_ptr_var_id : stage_out_ptr_var_id;

@ -6957,8 +6966,9 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 		auto &result_ptr_type = get<SPIRType>(ops[0]);

 		uint32_t const_mbr_id = next_id++;
-		uint32_t index = get_extended_decoration(var->self, SPIRVCrossDecorationInterfaceMemberIndex);
-		if (var->storage == StorageClassInput || is_block)
+		uint32_t index = get_extended_decoration(ops[2], SPIRVCrossDecorationInterfaceMemberIndex);
+
+		if (flatten_composites || is_block)
 		{
 			uint32_t i = first_non_array_index;
 			auto *type = &get_variable_element_type(*var);
@ -6977,9 +6987,9 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 			// In this case, we're poking into flattened structures and arrays, so now we have to
 			// combine the following indices. If we encounter a non-constant index,
 			// we're hosed.
-			for (; i < length; ++i)
+			for (; flatten_composites && i < length; ++i)
 			{
-				if ((multi_patch || (!is_array(*type) && !is_matrix(*type))) && type->basetype != SPIRType::Struct)
+				if (!is_array(*type) && !is_matrix(*type) && type->basetype != SPIRType::Struct)
 					break;

 				auto *c = maybe_get<SPIRConstant>(ops[i]);
@ -7007,31 +7017,48 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 					type = &get<SPIRType>(type->member_types[c->scalar()]);
 			}

-			if ((!multi_patch && (is_matrix(result_ptr_type) || is_array(result_ptr_type))) ||
-			    result_ptr_type.basetype == SPIRType::Struct)
-			{
-				// We're not going to emit the actual member name, we let any further OpLoad take care of that.
-				// Tag the access chain with the member index we're referencing.
-				set_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex, index);
-			}
-			else
+			// We're not going to emit the actual member name, we let any further OpLoad take care of that.
+			// Tag the access chain with the member index we're referencing.
+			bool defer_access_chain = flatten_composites && (is_matrix(result_ptr_type) || is_array(result_ptr_type) ||
+			                                                 result_ptr_type.basetype == SPIRType::Struct);
+
+			if (!defer_access_chain)
 			{
 				// Access the appropriate member of gl_in/gl_out.
 				set<SPIRConstant>(const_mbr_id, get_uint_type_id(), index, false);
 				indices.push_back(const_mbr_id);

+				// Member index is now irrelevant.
+				index = uint32_t(-1);
+
 				// Append any straggling access chain indices.
 				if (i < length)
 					indices.insert(indices.end(), ops + i, ops + length);
 			}
+			else
+			{
+				// We must have consumed the entire access chain if we're deferring it.
+				assert(i == length);
+			}
+
+			if (index != uint32_t(-1))
+				set_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex, index);
+			else
+				unset_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex);
 		}
 		else
 		{
-			assert(index != uint32_t(-1));
-			set<SPIRConstant>(const_mbr_id, get_uint_type_id(), index, false);
-			indices.push_back(const_mbr_id);
+			if (index != uint32_t(-1))
+			{
+				set<SPIRConstant>(const_mbr_id, get_uint_type_id(), index, false);
+				indices.push_back(const_mbr_id);
+			}

-			indices.insert(indices.end(), ops + 4, ops + length);
+			// Member index is now irrelevant.
+			index = uint32_t(-1);
+			unset_extended_decoration(ops[1], SPIRVCrossDecorationInterfaceMemberIndex);
+
+			indices.insert(indices.end(), ops + first_non_array_index, ops + length);
 		}

 		// We use the pointer to the base of the input/output array here,
@ -7057,7 +7084,7 @@ bool CompilerMSL::emit_tessellation_access_chain(const uint32_t *ops, uint32_t l
 			// First one is the gl_in/gl_out struct itself, then an index into that array.
 			// If we have traversed further, we use a normal access chain formulation.
 			auto *ptr_expr = maybe_get<SPIRExpression>(ptr);
-			if (ptr_expr && ptr_expr->implied_read_expressions.size() == 2)
+			if (flatten_composites && ptr_expr && ptr_expr->implied_read_expressions.size() == 2)
 			{
 				e = join(to_expression(ptr),
 				         access_chain_internal(stage_var_id, indices.data(), uint32_t(indices.size()),
@ -10665,12 +10692,16 @@ uint32_t CompilerMSL::get_or_allocate_builtin_input_member_location(spv::BuiltIn
 	auto &mbr_type = get<SPIRType>(get<SPIRType>(type_id).member_types[index]);
 	uint32_t count = type_to_location_count(mbr_type);

-	// This should always be 1.
-	if (count != 1)
-		return k_unknown_location;
-
 	loc = 0;
-	while (location_inputs_in_use.count(loc) != 0)
+
+	const auto location_range_in_use = [this](uint32_t location, uint32_t location_count) -> bool {
+		for (uint32_t i = 0; i < location_count; i++)
+			if (location_inputs_in_use.count(location + i) != 0)
+				return true;
+		return false;
+	};
+
+	while (location_range_in_use(loc, count))
 		loc++;

 	set_member_decoration(type_id, index, DecorationLocation, loc);
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@ -1077,6 +1077,8 @@ protected:
 	bool type_is_pointer_to_pointer(const SPIRType &type) const;
 	bool is_supported_argument_buffer_type(const SPIRType &type) const;

+	bool variable_storage_requires_stage_io(spv::StorageClass storage) const;
+
 	// OpcodeHandler that handles several MSL preprocessing operations.
 	struct OpCodePreprocessor : OpcodeHandler
 	{