Merge pull request #2240 from KhronosGroup/fix-2239

MSL: Remove special case for threadgroup array wrapper.
2023-12-07 14:50:19 +01:00 · 2023-12-07 14:50:19 +01:00 · f349c91274
commit f349c91274
parent 13aab7f07e 2603fbe520
73 changed files with 1414 additions and 687 deletions
--- a/reference/opt/shaders-msl/comp/complex-type-alias.comp
+++ b/reference/opt/shaders-msl/comp/complex-type-alias.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct Foo0
 {
    float a;
@ -44,7 +85,7 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 8u, 1u);

 kernel void main0(device SSBO& _53 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
-    threadgroup Foo2 coeffs[64];
+    threadgroup spvUnsafeArray<Foo2, 64> coeffs;
    coeffs[gl_LocalInvocationIndex] = Foo2{ Foo1{ Foo0{ 0.0 } }, 0.0 };
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (gl_LocalInvocationIndex == 0u)
--- a/reference/opt/shaders-msl/comp/shared-array-of-arrays.comp
+++ b/reference/opt/shaders-msl/comp/shared-array-of-arrays.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct SSBO
 {
    float out_data[1];
@ -12,7 +53,7 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 4u, 1u);

 kernel void main0(device SSBO& _67 [[buffer(0)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup float foo[4][4];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float, 4>, 4> foo;
    foo[gl_LocalInvocationID.x][gl_LocalInvocationID.y] = float(gl_LocalInvocationIndex);
    threadgroup_barrier(mem_flags::mem_threadgroup);
    _67.out_data[gl_GlobalInvocationID.x] = ((foo[gl_LocalInvocationID.x][0] + foo[gl_LocalInvocationID.x][1]) + foo[gl_LocalInvocationID.x][2]) + foo[gl_LocalInvocationID.x][3];
--- a/reference/opt/shaders-msl/comp/shared.comp
+++ b/reference/opt/shaders-msl/comp/shared.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct SSBO
 {
    float in_data[1];
@ -17,7 +58,7 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 1u, 1u);

 kernel void main0(const device SSBO& _22 [[buffer(0)]], device SSBO2& _44 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
 {
-    threadgroup float sShared[4];
+    threadgroup spvUnsafeArray<float, 4> sShared;
    sShared[gl_LocalInvocationIndex] = _22.in_data[gl_GlobalInvocationID.x];
    threadgroup_barrier(mem_flags::mem_threadgroup);
    _44.out_data[gl_GlobalInvocationID.x] = sShared[3u - gl_LocalInvocationIndex];
--- a/reference/opt/shaders-msl/comp/threadgroup-boolean-workaround.comp
+++ b/reference/opt/shaders-msl/comp/threadgroup-boolean-workaround.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct SSBO
 {
    float4 values[1];
@ -12,7 +53,7 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 1u, 1u);

 kernel void main0(device SSBO& _23 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup short4 foo[4];
+    threadgroup spvUnsafeArray<short4, 4> foo;
    foo[gl_LocalInvocationIndex] = short4(_23.values[gl_GlobalInvocationID.x] != float4(10.0));
    threadgroup_barrier(mem_flags::mem_threadgroup);
    _23.values[gl_GlobalInvocationID.x] = select(float4(40.0), float4(30.0), bool4(foo[gl_LocalInvocationIndex ^ 3u]));
--- a/reference/opt/shaders-msl/masking/copy-arrays.mask-location-0.msl2.multi-patch.tesc
+++ b/reference/opt/shaders-msl/masking/copy-arrays.mask-location-0.msl2.multi-patch.tesc
@ -44,114 +44,6 @@ struct spvUnsafeArray
    }
 };

-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToDevice1(device T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToDevice1(device T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToDevice1(device T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToDevice1(device T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToStack1(thread T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToThreadGroup1(threadgroup T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
 struct main0_out
 {
    float4 gl_Position;
@ -171,14 +63,14 @@ struct main0_in
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup float4 spvStorageFoo[8][4][2];
-    threadgroup float4 (&Foo)[4][2] = spvStorageFoo[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float4, 2>, 4> spvStorageFoo[8];
+    threadgroup auto &Foo = spvStorageFoo[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
    gl_out[gl_InvocationID].gl_Position = float4(1.0);
-    spvArrayCopyFromDeviceToThreadGroup1(Foo[gl_InvocationID], gl_in[gl_InvocationID].iFoo.elements);
+    Foo[gl_InvocationID] = gl_in[gl_InvocationID].iFoo;
    if (gl_InvocationID == 0)
    {
        spvUnsafeArray<float4, 2> _56 = spvUnsafeArray<float4, 2>({ gl_in[0].ipFoo, gl_in[1].ipFoo });
--- a/reference/opt/shaders-msl/masking/copy-arrays.mask-location-0.msl2.tesc
+++ b/reference/opt/shaders-msl/masking/copy-arrays.mask-location-0.msl2.tesc
@ -44,114 +44,6 @@ struct spvUnsafeArray
    }
 };

-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToDevice1(device T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToDevice1(device T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToDevice1(device T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToDevice1(device T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToStack1(thread T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToThreadGroup1(threadgroup T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
 struct main0_out
 {
    float4 gl_Position;
@ -171,7 +63,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup float4 Foo[4][2];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float4, 2>, 4> Foo;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
@ -181,7 +73,7 @@ kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_
        return;
    gl_out[gl_InvocationID].gl_Position = float4(1.0);
    spvUnsafeArray<float4, 2> _38 = spvUnsafeArray<float4, 2>({ gl_in[gl_InvocationID].iFoo_0, gl_in[gl_InvocationID].iFoo_1 });
-    spvArrayCopyFromStackToThreadGroup1(Foo[gl_InvocationID], _38.elements);
+    Foo[gl_InvocationID] = _38;
    if (gl_InvocationID == 0)
    {
        spvUnsafeArray<float4, 2> _56 = spvUnsafeArray<float4, 2>({ gl_in[0].ipFoo, gl_in[1].ipFoo });
--- a/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
@ -32,7 +32,7 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    threadgroup P spvStorage_11[8];
-    threadgroup P (&_11) = spvStorage_11[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup auto &_11 = spvStorage_11[(gl_GlobalInvocationID.x / 4) % 8];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
    _11.a = 1.0;
--- a/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.msl2.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.msl2.tesc
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct P
 {
    float a;
@ -29,7 +70,7 @@ struct main0_patchOut

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup C c[4];
+    threadgroup spvUnsafeArray<C, 4> c;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    patchOut.m_11_a = 1.0;
--- a/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct P
 {
    float a;
@ -30,8 +71,8 @@ struct main0_patchOut
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup C spvStoragec[8][4];
-    threadgroup C (&c)[4] = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<C, 4> spvStoragec[8];
+    threadgroup auto &c = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct main0_out
 {
    float4 gl_Position;
@ -16,7 +57,7 @@ struct main0_patchOut

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup float4 v0[4];
+    threadgroup spvUnsafeArray<float4, 4> v0;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    v0[gl_InvocationID] = float4(1.0);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
@ -59,8 +59,8 @@ struct main0_patchOut
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup float4 spvStoragev0[8][4];
-    threadgroup float4 (&v0)[4] = spvStoragev0[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<float4, 4> spvStoragev0[8];
+    threadgroup auto &v0 = spvStoragev0[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.tesc
@ -58,7 +58,7 @@ struct main0_patchOut

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup float4 v0[4];
+    threadgroup spvUnsafeArray<float4, 4> v0;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    v0[gl_InvocationID] = float4(1.0);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct main0_out
 {
    float4 v0;
@ -19,8 +60,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
-    threadgroup float4 spvStoragev1[8][2];
-    threadgroup float4 (&v1)[2] = spvStoragev1[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<float4, 2> spvStoragev1[8];
+    threadgroup auto &v1 = spvStoragev1[(gl_GlobalInvocationID.x / 4) % 8];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
    gl_out[gl_InvocationID].v0 = float4(1.0);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.tesc
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct main0_out
 {
    float4 v0;
@ -17,7 +58,7 @@ struct main0_patchOut

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup float4 v1[2];
+    threadgroup spvUnsafeArray<float4, 2> v1;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    gl_out[gl_InvocationID].v0 = float4(1.0);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
@ -67,8 +67,8 @@ struct main0_patchOut
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.tesc
@ -66,7 +66,7 @@ struct main0_patchOut

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    gl_out[gl_InvocationID].v0 = float4(1.0);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc
@ -67,8 +67,8 @@ struct main0_patchOut
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/opt/shaders-msl/masking/write-outputs.mask-position.tesc
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-position.tesc
@ -66,7 +66,7 @@ struct main0_patchOut

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    gl_out[gl_InvocationID].v0 = float4(1.0);
--- a/reference/opt/shaders-ue4/asm/tesc/hs-incorrect-base-type.invalid.asm.tesc
+++ b/reference/opt/shaders-ue4/asm/tesc/hs-incorrect-base-type.invalid.asm.tesc
@ -300,7 +300,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_View& View [[buffer(0)]], const device type_StructuredBuffer_v4float& View_PrimitiveSceneData [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FPNTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FPNTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
--- a/reference/opt/shaders-ue4/asm/tesc/hs-input-array-access.invalid.asm.tesc
+++ b/reference/opt/shaders-ue4/asm/tesc/hs-input-array-access.invalid.asm.tesc
@ -342,7 +342,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_View& View [[buffer(0)]], constant type_Primitive& Primitive [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FPNTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FPNTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
--- a/reference/opt/shaders-ue4/asm/tesc/hs-texcoord-array.invalid.asm.tesc
+++ b/reference/opt/shaders-ue4/asm/tesc/hs-texcoord-array.invalid.asm.tesc
@ -318,7 +318,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_View& View [[buffer(0)]], constant type_Primitive& Primitive [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FPNTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FPNTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
--- a/reference/opt/shaders-ue4/asm/tesc/tess-factor-must-be-threadgroup.invalid.asm.tesc
+++ b/reference/opt/shaders-ue4/asm/tesc/tess-factor-must-be-threadgroup.invalid.asm.tesc
@ -136,7 +136,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_Primitive& Primitive [[buffer(0)]], constant type_Material& Material [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FFlatTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FFlatTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    if (gl_InvocationID < spvIndirectParams[0])
        gl_in[gl_InvocationID] = in;
--- a/reference/shaders-msl-no-opt/asm/comp/composite-construct-buffer-struct.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/composite-construct-buffer-struct.asm.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct Block
 {
    uint2 _m0[2];
@ -16,7 +57,7 @@ struct SSBO

 kernel void main0(device SSBO& ssbo [[buffer(0)]])
 {
-    threadgroup uint2 _18[2];
+    threadgroup spvUnsafeArray<uint2, 2> _18;
    ssbo._m0[0u] = Block{ { ssbo._m0[0u]._m1[0], ssbo._m0[0u]._m1[1] }, { ssbo._m0[0u]._m1[0], ssbo._m0[0u]._m1[1] } };
 }

--- a/reference/shaders-msl-no-opt/asm/comp/device-array-load-temporary.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/device-array-load-temporary.asm.comp
@ -165,7 +165,7 @@ struct SSBO

 kernel void main0(device SSBO& ssbo [[buffer(0)]])
 {
-    threadgroup uint2 _18[2];
+    threadgroup spvUnsafeArray<uint2, 2> _18;
    spvUnsafeArray<uint2, 2> _27;
    spvArrayCopyFromDeviceToStack1(_27.elements, ssbo._m0[0u]._m1);
    spvArrayCopyFromStackToDevice1(ssbo._m0[0u]._m0, _27.elements);
--- a/reference/shaders-msl-no-opt/asm/comp/device-constant-array-load-store.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/device-constant-array-load-store.asm.comp
@ -165,15 +165,15 @@ struct SSBO

 kernel void main0(device SSBO& ssbo [[buffer(0)]], constant SSBO& ubo [[buffer(1)]])
 {
-    threadgroup uint2 _18[2];
+    threadgroup spvUnsafeArray<uint2, 2> _18;
    spvArrayCopyFromDeviceToDevice1(ssbo._m0[0u]._m0, ssbo._m0[0u]._m1);
    spvArrayCopyFromConstantToDevice1(ssbo._m0[0u]._m0, ubo._m0[0u]._m1);
    spvUnsafeArray<uint2, 2> _23;
    spvArrayCopyFromStackToDevice1(ssbo._m0[0u]._m0, _23.elements);
-    spvArrayCopyFromThreadGroupToDevice1(ssbo._m0[0u]._m0, _18);
-    spvArrayCopyFromDeviceToThreadGroup1(_18, ssbo._m0[0u]._m1);
+    spvArrayCopyFromThreadGroupToDevice1(ssbo._m0[0u]._m0, _18.elements);
+    spvArrayCopyFromDeviceToThreadGroup1(_18.elements, ssbo._m0[0u]._m1);
    spvArrayCopyFromDeviceToStack1(_23.elements, ssbo._m0[0u]._m1);
-    spvArrayCopyFromConstantToThreadGroup1(_18, ubo._m0[0u]._m1);
+    spvArrayCopyFromConstantToThreadGroup1(_18.elements, ubo._m0[0u]._m1);
    spvArrayCopyFromConstantToStack1(_23.elements, ubo._m0[0u]._m1);
 }

--- a/reference/shaders-msl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
@ -0,0 +1,78 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Data
+{
+    spvUnsafeArray<float3, 16> sourceData;
+};
+
+kernel void main0(texture2d<float> g_inputTexture [[texture(0)]], texture2d<uint, access::write> g_output [[texture(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    threadgroup spvUnsafeArray<Data, 64> g_data;
+    uint _49;
+    _49 = 0u;
+    for (; _49 < 4u; _49++)
+    {
+        for (uint _56 = 0u; _56 < 4u; )
+        {
+            int3 _65 = int3(gl_GlobalInvocationID) + int3(int(_56), int(_49), 0);
+            g_data[gl_GlobalInvocationID.x].sourceData[(_49 * 4u) + _56] = g_inputTexture.read(uint2(_65.xy), _65.z).xyz;
+            _56++;
+            continue;
+        }
+    }
+    spvUnsafeArray<float3, 16> _45 = g_data[gl_GlobalInvocationID.x].sourceData;
+    uint _77;
+    _77 = 0u;
+    for (int _80 = 0; _80 < 16; )
+    {
+        _77 |= uint(fast::clamp(dot(_45[_80], float3(-1.0)), 0.0, 1.0));
+        _80++;
+        continue;
+    }
+    g_output.write(uint4(_77), uint2(gl_GlobalInvocationID.xy));
+}
+
--- a/reference/shaders-msl-no-opt/asm/comp/variable-pointers-2.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/variable-pointers-2.asm.comp
@ -1,13 +1,54 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);

 kernel void main0(uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup float2 test[64];
+    threadgroup spvUnsafeArray<float2, 64> test;
    float _21 = float(gl_GlobalInvocationID.x);
    float2 _22 = float2(_21);
    ((&((&test)[0u]))[0u])[1u + 2u] = _22;
--- a/reference/shaders-msl-no-opt/asm/comp/variable-pointers-3.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/variable-pointers-3.asm.comp
@ -0,0 +1,55 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);
+
+kernel void main0(uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    threadgroup spvUnsafeArray<float2, 64> test;
+    float _21 = float(gl_GlobalInvocationID.x);
+    (true ? &((threadgroup float*)&((&test)[0u])[2u])[0u] : &((threadgroup float*)&((&test)[0u])[2u])[0u])[1u] = _21;
+}
+
--- a/reference/shaders-msl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
@ -1,13 +1,54 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u);

 kernel void main0(uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup float2 test[64];
+    threadgroup spvUnsafeArray<float2, 64> test;
    float _21 = float(gl_GlobalInvocationID.x);
    ((threadgroup float*)&(*(true ? &test[1u] : &test[2u])))[1u] = _21;
 }
--- a/reference/shaders-msl-no-opt/asm/comp/variable-pointers.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/variable-pointers.asm.comp
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct foo
 {
    int a[128];
@ -35,14 +74,14 @@ device int* select_buffer_null(device foo& buf, constant bar& cb)
 }

 static inline __attribute__((always_inline))
-threadgroup int* select_tgsm(constant bar& cb, threadgroup int (&tgsm)[128])
+threadgroup int* select_tgsm(constant bar& cb, threadgroup spvUnsafeArray<int, 128>& tgsm)
 {
    return (cb.d != 0) ? &tgsm[0u] : nullptr;
 }

 kernel void main0(device foo& buf [[buffer(0)]], constant bar& cb [[buffer(1)]], device baz& buf2 [[buffer(2)]])
 {
-    threadgroup int tgsm[128];
+    threadgroup spvUnsafeArray<int, 128> tgsm;
    device int* sbuf = select_buffer(buf, buf2, cb);
    device int* sbuf2 = select_buffer_null(buf, cb);
    threadgroup int* stgsm = select_tgsm(cb, tgsm);
--- a/reference/shaders-msl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.comp
+++ b/reference/shaders-msl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 constant uint _15_tmp [[function_constant(0)]];
 constant uint _15 = is_function_constant_defined(_15_tmp) ? _15_tmp : 1u;
 constant uint _16_tmp [[function_constant(1)]];
@ -30,8 +71,8 @@ constant uchar4 _137 = {};

 kernel void main0(device _6& _25 [[buffer(0)]], constant _8& _29 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
-    threadgroup uint _5[256];
-    threadgroup uchar _10[1024];
+    threadgroup spvUnsafeArray<uint, 256> _5;
+    threadgroup spvUnsafeArray<uchar, 1024> _10;
    uint3 _20 = gl_WorkGroupSize;
    bool _40 = _29._m0._m0 != 0u;
    if (_40)
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-location-0.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-location-0.msl2.asm.tesc
@ -69,7 +69,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_
 {
    spvUnsafeArray<C, 4> _21 = spvUnsafeArray<C, 4>({ C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) } });
    
-    threadgroup C c[4];
+    threadgroup spvUnsafeArray<C, 4> c;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    c[gl_InvocationID] = _21[gl_InvocationID];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-location-0.multi-patch.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-location-0.multi-patch.msl2.asm.tesc
@ -70,8 +70,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    spvUnsafeArray<C, 4> _21 = spvUnsafeArray<C, 4>({ C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) } });
    
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup C spvStoragec[8][4];
-    threadgroup C (&c)[4] = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<C, 4> spvStoragec[8];
+    threadgroup auto &c = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
    c[gl_GlobalInvocationID.x % 4] = _21[gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    patchOut.p_v = float4(0.0);
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-location-1.multi-patch.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-location-1.multi-patch.msl2.asm.tesc
@ -72,7 +72,7 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    gl_out[gl_GlobalInvocationID.x % 4].c_v = _21[gl_GlobalInvocationID.x % 4].v;
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    threadgroup P spvStoragep[8];
-    threadgroup P (&p) = spvStoragep[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup auto &p = spvStoragep[(gl_GlobalInvocationID.x / 4) % 8];
    p = P{ float4(0.0) };
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-point-size.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-point-size.msl2.asm.tesc
@ -83,7 +83,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_
    spvUnsafeArray<C, 4> _21 = spvUnsafeArray<C, 4>({ C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) } });
    spvUnsafeArray<gl_PerVertex, 4> _39 = spvUnsafeArray<gl_PerVertex, 4>({ gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) } });
    
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    gl_out[gl_InvocationID].c_v = _21[gl_InvocationID].v;
    gl_out[gl_InvocationID].gl_Position = _39[gl_InvocationID].gl_Position;
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-point-size.multi-patch.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-point-size.multi-patch.msl2.asm.tesc
@ -88,8 +88,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    gl_out[gl_GlobalInvocationID.x % 4].gl_Position = _39[gl_GlobalInvocationID.x % 4].gl_Position;
    gl_out[gl_GlobalInvocationID.x % 4].gl_ClipDistance = _39[gl_GlobalInvocationID.x % 4].gl_ClipDistance;
    gl_out[gl_GlobalInvocationID.x % 4].gl_CullDistance = _39[gl_GlobalInvocationID.x % 4].gl_CullDistance;
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    gl_out_masked[gl_GlobalInvocationID.x % 4] = _39[gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    patchOut.p_v = float4(0.0);
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-position.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-position.msl2.asm.tesc
@ -83,7 +83,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_
    spvUnsafeArray<C, 4> _21 = spvUnsafeArray<C, 4>({ C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) }, C{ float4(0.0) } });
    spvUnsafeArray<gl_PerVertex, 4> _39 = spvUnsafeArray<gl_PerVertex, 4>({ gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) } });
    
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    gl_out[gl_InvocationID].c_v = _21[gl_InvocationID].v;
    gl_out[gl_InvocationID].gl_PointSize = _39[gl_InvocationID].gl_PointSize;
--- a/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-position.multi-patch.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers-block.mask-position.multi-patch.msl2.asm.tesc
@ -88,8 +88,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    gl_out[gl_GlobalInvocationID.x % 4].gl_PointSize = _39[gl_GlobalInvocationID.x % 4].gl_PointSize;
    gl_out[gl_GlobalInvocationID.x % 4].gl_ClipDistance = _39[gl_GlobalInvocationID.x % 4].gl_ClipDistance;
    gl_out[gl_GlobalInvocationID.x % 4].gl_CullDistance = _39[gl_GlobalInvocationID.x % 4].gl_CullDistance;
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    gl_out_masked[gl_GlobalInvocationID.x % 4] = _39[gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    patchOut.p_v = float4(0.0);
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-location-0.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-location-0.msl2.asm.tesc
@ -73,7 +73,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_
 {
    spvUnsafeArray<_RESERVED_IDENTIFIER_FIXUP_gl_PerVertex, 4> _32 = spvUnsafeArray<_RESERVED_IDENTIFIER_FIXUP_gl_PerVertex, 4>({ _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) } });
    
-    threadgroup float4 foo[4];
+    threadgroup spvUnsafeArray<float4, 4> foo;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    foo[gl_InvocationID] = _17[gl_InvocationID];
    gl_out[gl_InvocationID].gl_Position = _32[gl_InvocationID]._RESERVED_IDENTIFIER_FIXUP_gl_Position;
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-location-0.msl2.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-location-0.msl2.multi-patch.asm.tesc
@ -74,8 +74,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    spvUnsafeArray<_RESERVED_IDENTIFIER_FIXUP_gl_PerVertex, 4> _32 = spvUnsafeArray<_RESERVED_IDENTIFIER_FIXUP_gl_PerVertex, 4>({ _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, _RESERVED_IDENTIFIER_FIXUP_gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) } });
    
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup float4 spvStoragefoo[8][4];
-    threadgroup float4 (&foo)[4] = spvStoragefoo[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<float4, 4> spvStoragefoo[8];
+    threadgroup auto &foo = spvStoragefoo[(gl_GlobalInvocationID.x / 4) % 8];
    foo[gl_GlobalInvocationID.x % 4] = _17[gl_GlobalInvocationID.x % 4];
    gl_out[gl_GlobalInvocationID.x % 4].gl_Position = _32[gl_GlobalInvocationID.x % 4]._RESERVED_IDENTIFIER_FIXUP_gl_Position;
    gl_out[gl_GlobalInvocationID.x % 4].gl_PointSize = _32[gl_GlobalInvocationID.x % 4]._RESERVED_IDENTIFIER_FIXUP_gl_PointSize;
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-location-1.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-location-1.multi-patch.asm.tesc
@ -80,7 +80,7 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    gl_out[gl_GlobalInvocationID.x % 4].gl_CullDistance = _32[gl_GlobalInvocationID.x % 4]._RESERVED_IDENTIFIER_FIXUP_gl_CullDistance;
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    threadgroup float4 spvStoragefoo_patch[8];
-    threadgroup float4 (&foo_patch) = spvStoragefoo_patch[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup auto &foo_patch = spvStoragefoo_patch[(gl_GlobalInvocationID.x / 4) % 8];
    foo_patch = float4(0.0);
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-point-size.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-point-size.msl2.asm.tesc
@ -73,7 +73,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_
 {
    spvUnsafeArray<gl_PerVertex, 4> _32 = spvUnsafeArray<gl_PerVertex, 4>({ gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) } });
    
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    gl_out[gl_InvocationID].foo = _17[gl_InvocationID];
    gl_out[gl_InvocationID].gl_Position = _32[gl_InvocationID].gl_Position;
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-point-size.msl2.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-point-size.msl2.multi-patch.asm.tesc
@ -78,8 +78,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    gl_out[gl_GlobalInvocationID.x % 4].gl_Position = _32[gl_GlobalInvocationID.x % 4].gl_Position;
    gl_out[gl_GlobalInvocationID.x % 4].gl_ClipDistance = _32[gl_GlobalInvocationID.x % 4].gl_ClipDistance;
    gl_out[gl_GlobalInvocationID.x % 4].gl_CullDistance = _32[gl_GlobalInvocationID.x % 4].gl_CullDistance;
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    gl_out_masked[gl_GlobalInvocationID.x % 4] = _32[gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    patchOut.foo_patch = float4(0.0);
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-position.msl2.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-position.msl2.asm.tesc
@ -73,7 +73,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_
 {
    spvUnsafeArray<gl_PerVertex, 4> _32 = spvUnsafeArray<gl_PerVertex, 4>({ gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) }, gl_PerVertex{ float4(0.0), 0.0, spvUnsafeArray<float, 1>({ 0.0 }), spvUnsafeArray<float, 1>({ 0.0 }) } });
    
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    gl_out[gl_InvocationID].foo = _17[gl_InvocationID];
    gl_out[gl_InvocationID].gl_PointSize = _32[gl_InvocationID].gl_PointSize;
--- a/reference/shaders-msl-no-opt/asm/masking/initializers.mask-position.msl2.multi-patch.asm.tesc
+++ b/reference/shaders-msl-no-opt/asm/masking/initializers.mask-position.msl2.multi-patch.asm.tesc
@ -78,8 +78,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    gl_out[gl_GlobalInvocationID.x % 4].gl_PointSize = _32[gl_GlobalInvocationID.x % 4].gl_PointSize;
    gl_out[gl_GlobalInvocationID.x % 4].gl_ClipDistance = _32[gl_GlobalInvocationID.x % 4].gl_ClipDistance;
    gl_out[gl_GlobalInvocationID.x % 4].gl_CullDistance = _32[gl_GlobalInvocationID.x % 4].gl_CullDistance;
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    gl_out_masked[gl_GlobalInvocationID.x % 4] = _32[gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    patchOut.foo_patch = float4(0.0);
--- a/reference/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
+++ b/reference/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
@ -44,129 +44,20 @@ struct spvUnsafeArray
    }
 };

-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToDevice1(device T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToDevice1(device T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToDevice1(device T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToDevice1(device T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToStack1(thread T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToThreadGroup1(threadgroup T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 1u, 1u);

 kernel void main0(uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
 {
-    threadgroup float shared_group[8][8];
-    threadgroup float shared_group_alt[8][8];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float, 8>, 8> shared_group;
+    threadgroup spvUnsafeArray<spvUnsafeArray<float, 8>, 8> shared_group_alt;
    spvUnsafeArray<float, 8> blob;
    for (int i = 0; i < 8; i++)
    {
        blob[i] = float(i);
    }
-    spvArrayCopyFromStackToThreadGroup1(shared_group[gl_LocalInvocationIndex], blob.elements);
+    shared_group[gl_LocalInvocationIndex] = blob;
    threadgroup_barrier(mem_flags::mem_threadgroup);
-    spvUnsafeArray<float, 8> copied_blob;
-    spvArrayCopyFromThreadGroupToStack1(copied_blob.elements, shared_group[gl_LocalInvocationIndex ^ 1u]);
-    spvArrayCopyFromThreadGroupToThreadGroup1(shared_group_alt[gl_LocalInvocationIndex], shared_group[gl_LocalInvocationIndex]);
+    spvUnsafeArray<float, 8> copied_blob = shared_group[gl_LocalInvocationIndex ^ 1u];
+    shared_group_alt[gl_LocalInvocationIndex] = shared_group[gl_LocalInvocationIndex];
 }

--- a/reference/shaders-msl/comp/complex-type-alias.comp
+++ b/reference/shaders-msl/comp/complex-type-alias.comp
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct Foo0
 {
    float a;
@ -52,7 +91,7 @@ void Zero(thread Foo0& v)

 kernel void main0(device SSBO& _53 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
-    threadgroup Foo2 coeffs[64];
+    threadgroup spvUnsafeArray<Foo2, 64> coeffs;
    Foo2 data;
    data.weight = 0.0;
    Foo0 param;
--- a/reference/shaders-msl/comp/functions.comp
+++ b/reference/shaders-msl/comp/functions.comp
@ -1,19 +1,58 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 static inline __attribute__((always_inline))
-void myfunc(threadgroup int (&foo)[1337])
+void myfunc(threadgroup spvUnsafeArray<int, 1337>& foo)
 {
    foo[0] = 13;
 }

 kernel void main0()
 {
-    threadgroup int foo[1337];
+    threadgroup spvUnsafeArray<int, 1337> foo;
    myfunc(foo);
 }

--- a/reference/shaders-msl/comp/shared-array-of-arrays.comp
+++ b/reference/shaders-msl/comp/shared-array-of-arrays.comp
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct SSBO
 {
    float out_data[1];
@ -13,7 +52,7 @@ struct SSBO
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 4u, 1u);

 static inline __attribute__((always_inline))
-void work(threadgroup float (&foo)[4][4], thread uint3& gl_LocalInvocationID, thread uint& gl_LocalInvocationIndex, device SSBO& _67, thread uint3& gl_GlobalInvocationID)
+void work(threadgroup spvUnsafeArray<spvUnsafeArray<float, 4>, 4>& foo, thread uint3& gl_LocalInvocationID, thread uint& gl_LocalInvocationIndex, device SSBO& _67, thread uint3& gl_GlobalInvocationID)
 {
    foo[gl_LocalInvocationID.x][gl_LocalInvocationID.y] = float(gl_LocalInvocationIndex);
    threadgroup_barrier(mem_flags::mem_threadgroup);
@ -27,7 +66,7 @@ void work(threadgroup float (&foo)[4][4], thread uint3& gl_LocalInvocationID, th

 kernel void main0(device SSBO& _67 [[buffer(0)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup float foo[4][4];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float, 4>, 4> foo;
    work(foo, gl_LocalInvocationID, gl_LocalInvocationIndex, _67, gl_GlobalInvocationID);
 }

--- a/reference/shaders-msl/comp/shared.comp
+++ b/reference/shaders-msl/comp/shared.comp
@ -1,8 +1,49 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct SSBO
 {
    float in_data[1];
@ -17,7 +58,7 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 1u, 1u);

 kernel void main0(const device SSBO& _22 [[buffer(0)]], device SSBO2& _44 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
 {
-    threadgroup float sShared[4];
+    threadgroup spvUnsafeArray<float, 4> sShared;
    uint ident = gl_GlobalInvocationID.x;
    float idata = _22.in_data[ident];
    sShared[gl_LocalInvocationIndex] = idata;
--- a/reference/shaders-msl/comp/threadgroup-boolean-workaround.comp
+++ b/reference/shaders-msl/comp/threadgroup-boolean-workaround.comp
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct SSBO
 {
    float4 values[1];
@ -13,7 +52,7 @@ struct SSBO
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(4u, 1u, 1u);

 static inline __attribute__((always_inline))
-void in_function(threadgroup short4 (&foo)[4], thread uint& gl_LocalInvocationIndex, device SSBO& _23, thread uint3& gl_GlobalInvocationID)
+void in_function(threadgroup spvUnsafeArray<short4, 4>& foo, thread uint& gl_LocalInvocationIndex, device SSBO& _23, thread uint3& gl_GlobalInvocationID)
 {
    foo[gl_LocalInvocationIndex] = short4(_23.values[gl_GlobalInvocationID.x] != float4(10.0));
    threadgroup_barrier(mem_flags::mem_threadgroup);
@ -22,7 +61,7 @@ void in_function(threadgroup short4 (&foo)[4], thread uint& gl_LocalInvocationIn

 kernel void main0(device SSBO& _23 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    threadgroup short4 foo[4];
+    threadgroup spvUnsafeArray<short4, 4> foo;
    in_function(foo, gl_LocalInvocationIndex, _23, gl_GlobalInvocationID);
 }

--- a/reference/shaders-msl/masking/copy-arrays.mask-location-0.msl2.multi-patch.tesc
+++ b/reference/shaders-msl/masking/copy-arrays.mask-location-0.msl2.multi-patch.tesc
@ -44,114 +44,6 @@ struct spvUnsafeArray
    }
 };

-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToDevice1(device T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToDevice1(device T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToDevice1(device T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToDevice1(device T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToStack1(thread T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToThreadGroup1(threadgroup T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
 struct main0_out
 {
    float4 gl_Position;
@ -171,14 +63,14 @@ struct main0_in
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], device main0_in* spvIn [[buffer(22)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup float4 spvStorageFoo[8][4][2];
-    threadgroup float4 (&Foo)[4][2] = spvStorageFoo[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float4, 2>, 4> spvStorageFoo[8];
+    threadgroup auto &Foo = spvStorageFoo[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    device main0_in* gl_in = &spvIn[min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1) * spvIndirectParams[0]];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
    gl_out[gl_InvocationID].gl_Position = float4(1.0);
-    spvArrayCopyFromDeviceToThreadGroup1(Foo[gl_InvocationID], gl_in[gl_InvocationID].iFoo.elements);
+    Foo[gl_InvocationID] = gl_in[gl_InvocationID].iFoo;
    if (gl_InvocationID == 0)
    {
        spvUnsafeArray<float4, 2> _56 = spvUnsafeArray<float4, 2>({ gl_in[0].ipFoo, gl_in[1].ipFoo });
--- a/reference/shaders-msl/masking/copy-arrays.mask-location-0.msl2.tesc
+++ b/reference/shaders-msl/masking/copy-arrays.mask-location-0.msl2.tesc
@ -44,114 +44,6 @@ struct spvUnsafeArray
    }
 };

-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToDevice1(device T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromConstantToDevice1(device T (&dst)[A], constant T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromStackToDevice1(device T (&dst)[A], thread const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromThreadGroupToDevice1(device T (&dst)[A], threadgroup const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToStack1(thread T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
-template<typename T, uint A>
-inline void spvArrayCopyFromDeviceToThreadGroup1(threadgroup T (&dst)[A], device const T (&src)[A])
-{
-    for (uint i = 0; i < A; i++)
-    {
-        dst[i] = src[i];
-    }
-}
-
 struct main0_out
 {
    float4 gl_Position;
@ -171,7 +63,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup float4 Foo[4][2];
+    threadgroup spvUnsafeArray<spvUnsafeArray<float4, 2>, 4> Foo;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
@ -181,7 +73,7 @@ kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_
        return;
    gl_out[gl_InvocationID].gl_Position = float4(1.0);
    spvUnsafeArray<float4, 2> _38 = spvUnsafeArray<float4, 2>({ gl_in[gl_InvocationID].iFoo_0, gl_in[gl_InvocationID].iFoo_1 });
-    spvArrayCopyFromStackToThreadGroup1(Foo[gl_InvocationID], _38.elements);
+    Foo[gl_InvocationID] = _38;
    if (gl_InvocationID == 0)
    {
        spvUnsafeArray<float4, 2> _56 = spvUnsafeArray<float4, 2>({ gl_in[0].ipFoo, gl_in[1].ipFoo });
--- a/reference/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
+++ b/reference/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
@ -44,7 +44,7 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    threadgroup P spvStorage_11[8];
-    threadgroup P (&_11) = spvStorage_11[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup auto &_11 = spvStorage_11[(gl_GlobalInvocationID.x / 4) % 8];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
    write_in_function(_11, patchOut, gl_out, gl_InvocationID);
--- a/reference/shaders-msl/masking/write-outputs-block.mask-location-1.msl2.tesc
+++ b/reference/shaders-msl/masking/write-outputs-block.mask-location-1.msl2.tesc
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct P
 {
    float a;
@ -30,7 +69,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_function(device main0_patchOut& patchOut, threadgroup C (&c)[4], device main0_out* thread & gl_out, thread uint& gl_InvocationID)
+void write_in_function(device main0_patchOut& patchOut, threadgroup spvUnsafeArray<C, 4>& c, device main0_out* thread & gl_out, thread uint& gl_InvocationID)
 {
    patchOut.m_11_a = 1.0;
    patchOut.m_11_b = 2.0;
@ -41,7 +80,7 @@ void write_in_function(device main0_patchOut& patchOut, threadgroup C (&c)[4], d

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup C c[4];
+    threadgroup spvUnsafeArray<C, 4> c;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    write_in_function(patchOut, c, gl_out, gl_InvocationID);
--- a/reference/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
+++ b/reference/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct P
 {
    float a;
@ -30,7 +69,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_function(device main0_patchOut& patchOut, threadgroup C (&c)[4], device main0_out* thread & gl_out, thread uint& gl_InvocationID)
+void write_in_function(device main0_patchOut& patchOut, threadgroup spvUnsafeArray<C, 4>& c, device main0_out* thread & gl_out, thread uint& gl_InvocationID)
 {
    patchOut.m_11_a = 1.0;
    patchOut.m_11_b = 2.0;
@ -42,8 +81,8 @@ void write_in_function(device main0_patchOut& patchOut, threadgroup C (&c)[4], d
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup C spvStoragec[8][4];
-    threadgroup C (&c)[4] = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<C, 4> spvStoragec[8];
+    threadgroup auto &c = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct main0_out
 {
    float4 gl_Position;
@ -17,7 +56,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, device float4& v1, device main0_out* thread & gl_out)
+void write_in_func(threadgroup spvUnsafeArray<float4, 4>& v0, thread uint& gl_InvocationID, device float4& v1, device main0_out* thread & gl_out)
 {
    v0[gl_InvocationID] = float4(1.0);
    v0[gl_InvocationID].x = 2.0;
@ -34,7 +73,7 @@ void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, de

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup float4 v0[4];
+    threadgroup spvUnsafeArray<float4, 4> v0;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    write_in_func(v0, gl_InvocationID, patchOut.v1, gl_out);
--- a/reference/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
@ -57,7 +57,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, device main0_out* thread & gl_out)
+void write_in_func(threadgroup spvUnsafeArray<float4, 4>& v0, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, device main0_out* thread & gl_out)
 {
    v0[gl_InvocationID] = float4(1.0);
    v0[gl_InvocationID].z = 3.0;
@ -77,8 +77,8 @@ void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, de
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup float4 spvStoragev0[8][4];
-    threadgroup float4 (&v0)[4] = spvStoragev0[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<float4, 4> spvStoragev0[8];
+    threadgroup auto &v0 = spvStoragev0[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/shaders-msl/masking/write-outputs.mask-location-0.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-0.tesc
@ -57,7 +57,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, device main0_out* thread & gl_out)
+void write_in_func(threadgroup spvUnsafeArray<float4, 4>& v0, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, device main0_out* thread & gl_out)
 {
    v0[gl_InvocationID] = float4(1.0);
    v0[gl_InvocationID].z = 3.0;
@ -76,7 +76,7 @@ void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, de

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup float4 v0[4];
+    threadgroup spvUnsafeArray<float4, 4> v0;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    write_in_func(v0, gl_InvocationID, patchOut.v1, patchOut.v3, gl_out);
--- a/reference/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct main0_out
 {
    float4 v0;
@ -18,7 +57,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, threadgroup float4 (&v1)[2], device float4& v3)
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, threadgroup spvUnsafeArray<float4, 2>& v1, device float4& v3)
 {
    gl_out[gl_InvocationID].v0 = float4(1.0);
    gl_out[gl_InvocationID].v0.z = 3.0;
@ -39,8 +78,8 @@ kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], devic
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
-    threadgroup float4 spvStoragev1[8][2];
-    threadgroup float4 (&v1)[2] = spvStoragev1[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<float4, 2> spvStoragev1[8];
+    threadgroup auto &v1 = spvStoragev1[(gl_GlobalInvocationID.x / 4) % 8];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
    write_in_func(gl_out, gl_InvocationID, v1, patchOut.v3);
--- a/reference/shaders-msl/masking/write-outputs.mask-location-1.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-location-1.tesc
@ -1,10 +1,49 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"

 #include <metal_stdlib>
 #include <simd/simd.h>

 using namespace metal;

+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
 struct main0_out
 {
    float4 v0;
@ -18,7 +57,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, threadgroup float4 (&v1)[2], device float4& v3)
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, threadgroup spvUnsafeArray<float4, 2>& v1, device float4& v3)
 {
    gl_out[gl_InvocationID].v0 = float4(1.0);
    gl_out[gl_InvocationID].v0.z = 3.0;
@ -37,7 +76,7 @@ void write_in_func(device main0_out* thread & gl_out, thread uint& gl_Invocation

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup float4 v1[2];
+    threadgroup spvUnsafeArray<float4, 2> v1;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    write_in_func(gl_out, gl_InvocationID, v1, patchOut.v3);
--- a/reference/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
@ -65,7 +65,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup gl_PerVertex (&gl_out_masked)[4])
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup spvUnsafeArray<gl_PerVertex, 4>& gl_out_masked)
 {
    gl_out[gl_InvocationID].v0 = float4(1.0);
    gl_out[gl_InvocationID].v0.z = 3.0;
@ -85,8 +85,8 @@ void write_in_func(device main0_out* thread & gl_out, thread uint& gl_Invocation
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/shaders-msl/masking/write-outputs.mask-point-size.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-point-size.tesc
@ -65,7 +65,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup gl_PerVertex (&gl_out_masked)[4])
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup spvUnsafeArray<gl_PerVertex, 4>& gl_out_masked)
 {
    gl_out[gl_InvocationID].v0 = float4(1.0);
    gl_out[gl_InvocationID].v0.z = 3.0;
@ -84,7 +84,7 @@ void write_in_func(device main0_out* thread & gl_out, thread uint& gl_Invocation

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    write_in_func(gl_out, gl_InvocationID, patchOut.v1, patchOut.v3, gl_out_masked);
--- a/reference/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc
@ -65,7 +65,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup gl_PerVertex (&gl_out_masked)[4])
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup spvUnsafeArray<gl_PerVertex, 4>& gl_out_masked)
 {
    gl_out[gl_InvocationID].v0 = float4(1.0);
    gl_out[gl_InvocationID].v0.z = 3.0;
@ -85,8 +85,8 @@ void write_in_func(device main0_out* thread & gl_out, thread uint& gl_Invocation
 kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
-    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
-    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> spvStoragegl_out_masked[8];
+    threadgroup auto &gl_out_masked = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1] - 1);
--- a/reference/shaders-msl/masking/write-outputs.mask-position.tesc
+++ b/reference/shaders-msl/masking/write-outputs.mask-position.tesc
@ -65,7 +65,7 @@ struct main0_patchOut
 };

 static inline __attribute__((always_inline))
-void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup gl_PerVertex (&gl_out_masked)[4])
+void write_in_func(device main0_out* thread & gl_out, thread uint& gl_InvocationID, device spvUnsafeArray<float4, 2>& v1, device float4& v3, threadgroup spvUnsafeArray<gl_PerVertex, 4>& gl_out_masked)
 {
    gl_out[gl_InvocationID].v0 = float4(1.0);
    gl_out[gl_InvocationID].v0.z = 3.0;
@ -84,7 +84,7 @@ void write_in_func(device main0_out* thread & gl_out, thread uint& gl_Invocation

 kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
 {
-    threadgroup gl_PerVertex gl_out_masked[4];
+    threadgroup spvUnsafeArray<gl_PerVertex, 4> gl_out_masked;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    write_in_func(gl_out, gl_InvocationID, patchOut.v1, patchOut.v3, gl_out_masked);
--- a/reference/shaders-ue4/asm/tesc/hs-incorrect-base-type.invalid.asm.tesc
+++ b/reference/shaders-ue4/asm/tesc/hs-incorrect-base-type.invalid.asm.tesc
@ -300,7 +300,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_View& View [[buffer(0)]], const device type_StructuredBuffer_v4float& View_PrimitiveSceneData [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FPNTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FPNTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
--- a/reference/shaders-ue4/asm/tesc/hs-input-array-access.invalid.asm.tesc
+++ b/reference/shaders-ue4/asm/tesc/hs-input-array-access.invalid.asm.tesc
@ -342,7 +342,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_View& View [[buffer(0)]], constant type_Primitive& Primitive [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FPNTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FPNTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
--- a/reference/shaders-ue4/asm/tesc/hs-texcoord-array.invalid.asm.tesc
+++ b/reference/shaders-ue4/asm/tesc/hs-texcoord-array.invalid.asm.tesc
@ -318,7 +318,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_View& View [[buffer(0)]], constant type_Primitive& Primitive [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FPNTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FPNTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID];
    if (gl_InvocationID < spvIndirectParams[0])
--- a/reference/shaders-ue4/asm/tesc/tess-factor-must-be-threadgroup.invalid.asm.tesc
+++ b/reference/shaders-ue4/asm/tesc/tess-factor-must-be-threadgroup.invalid.asm.tesc
@ -136,7 +136,7 @@ struct main0_in

 kernel void main0(main0_in in [[stage_in]], constant type_Primitive& Primitive [[buffer(0)]], constant type_Material& Material [[buffer(1)]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
 {
-    threadgroup FFlatTessellationHSToDS temp_var_hullMainRetVal[3];
+    threadgroup spvUnsafeArray<FFlatTessellationHSToDS, 3> temp_var_hullMainRetVal;
    device main0_out* gl_out = &spvOut[gl_PrimitiveID * 3];
    if (gl_InvocationID < spvIndirectParams[0])
        gl_in[gl_InvocationID] = in;
--- a/shaders-msl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
+++ b/shaders-msl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
@ -0,0 +1,137 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google spiregg; 0
+; Bound: 91
+; Schema: 0
+               OpCapability Shader
+               OpCapability StorageImageReadWithoutFormat
+               OpExtension "SPV_GOOGLE_hlsl_functionality1"
+               OpExtension "SPV_GOOGLE_user_type"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %csMain "main" %gl_GlobalInvocationID
+               OpExecutionMode %csMain LocalSize 8 8 1
+               OpSource HLSL 500
+               OpName %Data "Data"
+               OpMemberName %Data 0 "sourceData"
+               OpName %g_data "g_data"
+               OpName %type_2d_image "type.2d.image"
+               OpName %g_inputTexture "g_inputTexture"
+               OpName %type_2d_image_0 "type.2d.image"
+               OpName %g_output "g_output"
+               OpName %csMain "csMain"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorateString %gl_GlobalInvocationID UserSemantic "SV_DispatchThreadID"
+               OpDecorate %g_inputTexture DescriptorSet 0
+               OpDecorate %g_inputTexture Binding 0
+               OpDecorate %g_output DescriptorSet 0
+               OpDecorate %g_output Binding 0
+               OpDecorateString %g_inputTexture UserTypeGOOGLE "texture2d:<float4>"
+               OpDecorateString %g_output UserTypeGOOGLE "rwtexture2d:<uint>"
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_4 = OpConstant %uint 4
+     %uint_1 = OpConstant %uint 1
+     %int_16 = OpConstant %int 16
+      %float = OpTypeFloat 32
+   %float_n1 = OpConstant %float -1
+    %v3float = OpTypeVector %float 3
+         %20 = OpConstantComposite %v3float %float_n1 %float_n1 %float_n1
+    %float_0 = OpConstant %float 0
+    %float_1 = OpConstant %float 1
+      %int_1 = OpConstant %int 1
+    %uint_64 = OpConstant %uint 64
+    %uint_16 = OpConstant %uint 16
+%_arr_v3float_uint_16 = OpTypeArray %v3float %uint_16
+       %Data = OpTypeStruct %_arr_v3float_uint_16
+%_arr_Data_uint_64 = OpTypeArray %Data %uint_64
+%_ptr_Workgroup__arr_Data_uint_64 = OpTypePointer Workgroup %_arr_Data_uint_64
+%type_2d_image = OpTypeImage %float 2D 2 0 0 1 Unknown
+%_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+%type_2d_image_0 = OpTypeImage %uint 2D 2 0 0 2 R32ui
+%_ptr_UniformConstant_type_2d_image_0 = OpTypePointer UniformConstant %type_2d_image_0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+         %34 = OpTypeFunction %void
+     %v2uint = OpTypeVector %uint 2
+%_ptr_Function__arr_v3float_uint_16 = OpTypePointer Function %_arr_v3float_uint_16
+%_ptr_Workgroup__arr_v3float_uint_16 = OpTypePointer Workgroup %_arr_v3float_uint_16
+%_ptr_Function_v3float = OpTypePointer Function %v3float
+       %bool = OpTypeBool
+      %v3int = OpTypeVector %int 3
+      %v2int = OpTypeVector %int 2
+    %v4float = OpTypeVector %float 4
+%_ptr_Workgroup_v3float = OpTypePointer Workgroup %v3float
+     %g_data = OpVariable %_ptr_Workgroup__arr_Data_uint_64 Workgroup
+%g_inputTexture = OpVariable %_ptr_UniformConstant_type_2d_image UniformConstant
+   %g_output = OpVariable %_ptr_UniformConstant_type_2d_image_0 UniformConstant
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %csMain = OpFunction %void None %34
+         %44 = OpLabel
+         %45 = OpVariable %_ptr_Function__arr_v3float_uint_16 Function
+         %46 = OpLoad %v3uint %gl_GlobalInvocationID
+         %47 = OpCompositeExtract %uint %46 0
+               OpBranch %48
+         %48 = OpLabel
+         %49 = OpPhi %uint %uint_0 %44 %50 %51
+         %52 = OpULessThan %bool %49 %uint_4
+               OpLoopMerge %53 %51 None
+               OpBranchConditional %52 %54 %53
+         %54 = OpLabel
+               OpBranch %55
+         %55 = OpLabel
+         %56 = OpPhi %uint %uint_0 %54 %57 %58
+         %59 = OpULessThan %bool %56 %uint_4
+               OpLoopMerge %60 %58 None
+               OpBranchConditional %59 %58 %60
+         %58 = OpLabel
+         %61 = OpBitcast %v3int %46
+         %62 = OpBitcast %int %56
+         %63 = OpBitcast %int %49
+         %64 = OpCompositeConstruct %v3int %62 %63 %int_0
+         %65 = OpIAdd %v3int %61 %64
+         %66 = OpVectorShuffle %v2int %65 %65 0 1
+         %67 = OpCompositeExtract %int %65 2
+         %68 = OpLoad %type_2d_image %g_inputTexture
+         %69 = OpImageFetch %v4float %68 %66 Lod %67
+         %70 = OpVectorShuffle %v3float %69 %69 0 1 2
+         %71 = OpIMul %uint %49 %uint_4
+         %72 = OpIAdd %uint %71 %56
+         %73 = OpAccessChain %_ptr_Workgroup_v3float %g_data %47 %int_0 %72
+               OpStore %73 %70
+         %57 = OpIAdd %uint %56 %uint_1
+               OpBranch %55
+         %60 = OpLabel
+               OpBranch %51
+         %51 = OpLabel
+         %50 = OpIAdd %uint %49 %uint_1
+               OpBranch %48
+         %53 = OpLabel
+         %74 = OpAccessChain %_ptr_Workgroup__arr_v3float_uint_16 %g_data %47 %int_0
+         %75 = OpLoad %_arr_v3float_uint_16 %74
+               OpStore %45 %75
+               OpBranch %76
+         %76 = OpLabel
+         %77 = OpPhi %uint %uint_0 %53 %78 %79
+         %80 = OpPhi %int %int_0 %53 %81 %79
+         %82 = OpSLessThan %bool %80 %int_16
+               OpLoopMerge %83 %79 None
+               OpBranchConditional %82 %79 %83
+         %79 = OpLabel
+         %84 = OpAccessChain %_ptr_Function_v3float %45 %80
+         %85 = OpLoad %v3float %84
+         %86 = OpDot %float %85 %20
+         %87 = OpExtInst %float %1 FClamp %86 %float_0 %float_1
+         %88 = OpConvertFToU %uint %87
+         %78 = OpBitwiseOr %uint %77 %88
+         %81 = OpIAdd %int %80 %int_1
+               OpBranch %76
+         %83 = OpLabel
+         %89 = OpVectorShuffle %v2uint %46 %46 0 1
+         %90 = OpLoad %type_2d_image_0 %g_output
+               OpImageWrite %90 %89 %77 None
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl-no-opt/asm/comp/variable-pointers-3.asm.comp
+++ b/shaders-msl-no-opt/asm/comp/variable-pointers-3.asm.comp
@ -0,0 +1,60 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+			   OpCapability VariablePointers
+			   OpExtension "SPV_KHR_variable_pointers"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %test "test"
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+	  %bool = OpTypeBool
+	  %true = OpConstantTrue %bool
+      %v2float = OpTypeVector %float 2
+       %uint = OpTypeInt 32 0
+    %uint_64 = OpConstant %uint 64
+%_arr_v2float_uint_64 = OpTypeArray %v2float %uint_64
+%_ptr_Workgroup__arr_v2float_uint_64 = OpTypePointer Workgroup %_arr_v2float_uint_64
+       %test = OpVariable %_ptr_Workgroup__arr_v2float_uint_64 Workgroup
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+%_ptr_Workgroup_float = OpTypePointer Workgroup %float
+%_ptr_Workgroup_v2float = OpTypePointer Workgroup %v2float
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_64 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %14 = OpLoad %uint %gl_LocalInvocationIndex
+         %19 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %20 = OpLoad %uint %19
+         %21 = OpConvertUToF %float %20
+		 %22 = OpCompositeConstruct %v2float %21 %21
+
+			; Scalar shenanigans.
+		   %ptr6 = OpPtrAccessChain %_ptr_Workgroup_float %test %uint_0 %uint_2 %uint_0
+		   %ptr6_alt = OpPtrAccessChain %_ptr_Workgroup_float %test %uint_0 %uint_2 %uint_0
+		   %ptr6_sel = OpSelect %_ptr_Workgroup_float %true %ptr6 %ptr6_alt
+		   %ptr7 = OpPtrAccessChain %_ptr_Workgroup_float %ptr6_sel %uint_1
+		   OpStore %ptr7 %21
+
+               OpReturn
+               OpFunctionEnd
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@ -1464,7 +1464,7 @@ void CompilerMSL::emit_entry_point_declarations()
 	{
 		auto &var = get<SPIRVariable>(var_id);
 		add_local_variable_name(var_id);
-		statement(variable_decl(var), ";");
+		statement(CompilerGLSL::variable_decl(var), ";");
 		var.deferred_declaration = false;
 	}
 }
@ -3457,9 +3457,6 @@ void CompilerMSL::emit_local_masked_variable(const SPIRVariable &masked_var, boo
 			auto &type = get_variable_data_type(masked_var);
 			add_local_variable_name(masked_var.self);

-			bool old_is_builtin = is_using_builtin_array;
-			is_using_builtin_array = true;
-
 			const uint32_t max_control_points_per_patch = 32u;
 			uint32_t max_num_instances =
 					(max_control_points_per_patch + get_entry_point().output_vertices - 1u) /
@ -3475,14 +3472,12 @@ void CompilerMSL::emit_local_masked_variable(const SPIRVariable &masked_var, boo
 			// since Metal does not allow that. :(
 			// FIXME: We will likely need an option to support passing down target workgroup size,
 			// so we can emit appropriate size here.
-			statement("threadgroup ", type_to_glsl(type), " ",
-			          "(&", to_name(masked_var.self), ")",
-			          type_to_array_glsl(type), " = spvStorage", to_name(masked_var.self), "[",
+			statement("threadgroup auto ",
+			          "&", to_name(masked_var.self),
+			          " = spvStorage", to_name(masked_var.self), "[",
 			          "(", to_expression(builtin_invocation_id_id), ".x / ",
 			          get_entry_point().output_vertices, ") % ",
 			          max_num_instances, "];");
-
-			is_using_builtin_array = old_is_builtin;
 		});
 	}
 	else
@ -9770,8 +9765,8 @@ bool CompilerMSL::emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rh
 	bool lhs_is_thread_storage = storage_class_array_is_thread(lhs_storage);
 	bool rhs_is_thread_storage = storage_class_array_is_thread(rhs_storage);

-	bool lhs_is_array_template = lhs_is_thread_storage;
-	bool rhs_is_array_template = rhs_is_thread_storage;
+	bool lhs_is_array_template = lhs_is_thread_storage || lhs_storage == StorageClassWorkgroup;
+	bool rhs_is_array_template = rhs_is_thread_storage || rhs_storage == StorageClassWorkgroup;

 	// Special considerations for stage IO variables.
 	// If the variable is actually backed by non-user visible device storage, we use array templates for those.
@ -9786,15 +9781,13 @@ bool CompilerMSL::emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rh
 	auto *lhs_var = maybe_get_backing_variable(lhs_id);
 	if (lhs_var && lhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(lhs_var->storage))
 		lhs_is_array_template = true;
-	else if (lhs_var && (lhs_storage == StorageClassFunction || lhs_storage == StorageClassPrivate) &&
-	         type_is_block_like(get<SPIRType>(lhs_var->basetype)))
+	else if (lhs_var && lhs_storage != StorageClassGeneric && type_is_block_like(get<SPIRType>(lhs_var->basetype)))
 		lhs_is_array_template = false;

 	auto *rhs_var = maybe_get_backing_variable(rhs_id);
 	if (rhs_var && rhs_storage == StorageClassStorageBuffer && storage_class_array_is_thread(rhs_var->storage))
 		rhs_is_array_template = true;
-	else if (rhs_var && (rhs_storage == StorageClassFunction || rhs_storage == StorageClassPrivate) &&
-	         type_is_block_like(get<SPIRType>(rhs_var->basetype)))
+	else if (rhs_var && rhs_storage != StorageClassGeneric && type_is_block_like(get<SPIRType>(rhs_var->basetype)))
 		rhs_is_array_template = false;

 	// If threadgroup storage qualifiers are *not* used:
@ -14399,9 +14392,6 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg)
 	bool builtin = has_decoration(var.self, DecorationBuiltIn);
 	auto builtin_type = BuiltIn(get_decoration(arg.id, DecorationBuiltIn));

-	if (address_space == "threadgroup")
-		is_using_builtin_array = true;
-
 	if (var.basevariable && (var.basevariable == stage_in_ptr_var_id || var.basevariable == stage_out_ptr_var_id))
 		decl = join(cv_qualifier, type_to_glsl(type, arg.id));
 	else if (builtin)
@ -15327,19 +15317,6 @@ bool CompilerMSL::variable_decl_is_remapped_storage(const SPIRVariable &variable
 	}
 }

-std::string CompilerMSL::variable_decl(const SPIRVariable &variable)
-{
-	bool old_is_using_builtin_array = is_using_builtin_array;
-
-	// Threadgroup arrays can't have a wrapper type.
-	if (variable_decl_is_remapped_storage(variable, StorageClassWorkgroup))
-		is_using_builtin_array = true;
-
-	auto expr = CompilerGLSL::variable_decl(variable);
-	is_using_builtin_array = old_is_using_builtin_array;
-	return expr;
-}
-
 // GCC workaround of lambdas calling protected funcs
 std::string CompilerMSL::variable_decl(const SPIRType &type, const std::string &name, uint32_t id)
 {
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@ -857,9 +857,6 @@ protected:
 	std::string type_to_array_glsl(const SPIRType &type) override;
 	std::string constant_op_expression(const SPIRConstantOp &cop) override;

-	// Threadgroup arrays can't have a wrapper type
-	std::string variable_decl(const SPIRVariable &variable) override;
-
 	bool variable_decl_is_remapped_storage(const SPIRVariable &variable, spv::StorageClass storage) const override;

 	// GCC workaround of lambdas calling protected functions (for older GCC versions)