Enhancements to MSL compute and entry point naming.

Support Workgroup (threadgroup) variables. Mark if SPIRConstant is used as an array length, since it cannot be specialized. Resolve specialized array length constants. Support passing an array to MSL function. Support emitting GLSL array assignments in MSL via an array copy function. Support for memory and control barriers. Struct packing enhancements, including packing nested structs. Enhancements to replacing illegal MSL variable and function names. Add Compiler::get_entry_point_name_map() function to retrieve entry point renamings. Remove CompilerGLSL::clean_func_name() as obsolete. Fixes to types in bitcast MSL functions. Add Variant::get_id() member function. Add CompilerMSL::Options::msl_version option. Add numerous MSL compute tests.
2017-11-05 21:34:42 -05:00 · 2017-11-05 21:34:42 -05:00 · 1c18078811
commit 1c18078811
parent 9f06d909e4
69 changed files with 2705 additions and 188 deletions
--- a/reference/shaders-msl/asm/comp/bitcast_iadd.asm.comp
+++ b/reference/shaders-msl/asm/comp/bitcast_iadd.asm.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
+{
+    _6._m0 = _5._m1 + uint4(_5._m0);
+    _6._m0 = uint4(_5._m0) + _5._m1;
+    _6._m0 = _5._m1 + _5._m1;
+    _6._m0 = uint4(_5._m0 + _5._m0);
+    _6._m1 = int4(_5._m1 + _5._m1);
+    _6._m1 = _5._m0 + _5._m0;
+    _6._m1 = int4(_5._m1) + _5._m0;
+    _6._m1 = _5._m0 + int4(_5._m1);
+}
+
--- a/reference/shaders-msl/asm/comp/bitcast_sar.asm.comp
+++ b/reference/shaders-msl/asm/comp/bitcast_sar.asm.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
+{
+    _6._m0 = uint4(int4(_5._m1) >> _5._m0);
+    _6._m0 = uint4(_5._m0 >> int4(_5._m1));
+    _6._m0 = uint4(int4(_5._m1) >> int4(_5._m1));
+    _6._m0 = uint4(_5._m0 >> _5._m0);
+    _6._m1 = int4(_5._m1) >> int4(_5._m1);
+    _6._m1 = _5._m0 >> _5._m0;
+    _6._m1 = int4(_5._m1) >> _5._m0;
+    _6._m1 = _5._m0 >> int4(_5._m1);
+}
+
--- a/reference/shaders-msl/asm/comp/bitcast_sdiv.asm.comp
+++ b/reference/shaders-msl/asm/comp/bitcast_sdiv.asm.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
+{
+    _6._m0 = uint4(int4(_5._m1) / _5._m0);
+    _6._m0 = uint4(_5._m0 / int4(_5._m1));
+    _6._m0 = uint4(int4(_5._m1) / int4(_5._m1));
+    _6._m0 = uint4(_5._m0 / _5._m0);
+    _6._m1 = int4(_5._m1) / int4(_5._m1);
+    _6._m1 = _5._m0 / _5._m0;
+    _6._m1 = int4(_5._m1) / _5._m0;
+    _6._m1 = _5._m0 / int4(_5._m1);
+}
+
--- a/reference/shaders-msl/asm/comp/bitcast_slr.asm.comp
+++ b/reference/shaders-msl/asm/comp/bitcast_slr.asm.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
+{
+    _6._m0 = _5._m1 >> uint4(_5._m0);
+    _6._m0 = uint4(_5._m0) >> _5._m1;
+    _6._m0 = _5._m1 >> _5._m1;
+    _6._m0 = uint4(_5._m0) >> uint4(_5._m0);
+    _6._m1 = int4(_5._m1 >> _5._m1);
+    _6._m1 = int4(uint4(_5._m0) >> uint4(_5._m0));
+    _6._m1 = int4(_5._m1 >> uint4(_5._m0));
+    _6._m1 = int4(uint4(_5._m0) >> _5._m1);
+}
+
--- a/reference/shaders-msl/asm/comp/multiple-entry.asm.comp
+++ b/reference/shaders-msl/asm/comp/multiple-entry.asm.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct _6
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+struct _7
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+kernel void main0(device _6& _8 [[buffer(0)]], device _7& _9 [[buffer(1)]])
+{
+    _9._m0 = _8._m1 + uint4(_8._m0);
+    _9._m0 = uint4(_8._m0) + _8._m1;
+    _9._m0 = _8._m1 + _8._m1;
+    _9._m0 = uint4(_8._m0 + _8._m0);
+    _9._m1 = int4(_8._m1 + _8._m1);
+    _9._m1 = _8._m0 + _8._m0;
+    _9._m1 = int4(_8._m1) + _8._m0;
+    _9._m1 = _8._m0 + int4(_8._m1);
+}
+
--- a/reference/shaders-msl/asm/comp/quantize.asm.comp
+++ b/reference/shaders-msl/asm/comp/quantize.asm.comp
@ -0,0 +1,21 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO0
+{
+    float scalar;
+    float2 vec2_val;
+    float3 vec3_val;
+    float4 vec4_val;
+};
+
+kernel void main0(device SSBO0& _4 [[buffer(0)]])
+{
+    _4.scalar = float(half(_4.scalar));
+    _4.vec2_val = float2(half2(_4.vec2_val));
+    _4.vec3_val = float3(half3(_4.vec3_val));
+    _4.vec4_val = float4(half4(_4.vec4_val));
+}
+
--- a/reference/shaders-msl/comp/bake_gradient.comp
+++ b/reference/shaders-msl/comp/bake_gradient.comp
@ -0,0 +1,40 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+constant uint3 gl_WorkGroupSize = uint3(8u, 8u, 1u);
+
+struct UBO
+{
+    float4 uInvSize;
+    float4 uScale;
+};
+
+float jacobian(thread const float2& dDdx, thread const float2& dDdy)
+{
+    return ((1.0 + dDdx.x) * (1.0 + dDdy.y)) - (dDdx.y * dDdy.x);
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant UBO& _46 [[buffer(0)]], texture2d<float> uHeight [[texture(0)]], sampler uHeightSmplr [[sampler(0)]], texture2d<float> uDisplacement [[texture(1)]], sampler uDisplacementSmplr [[sampler(1)]], texture2d<float, access::write> iHeightDisplacement [[texture(2)]], texture2d<float, access::write> iGradJacobian [[texture(3)]])
+{
+    float4 uv = (float2(gl_GlobalInvocationID.xy) * _46.uInvSize.xy).xyxy + (_46.uInvSize * 0.5);
+    float h = uHeight.sample(uHeightSmplr, uv.xy, level(0.0)).x;
+    float x0 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(-1, 0)).x;
+    float x1 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(1, 0)).x;
+    float y0 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(0, -1)).x;
+    float y1 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(0, 1)).x;
+    float2 grad = (_46.uScale.xy * 0.5) * float2(x1 - x0, y1 - y0);
+    float2 displacement = uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0)).xy * 1.2000000476837158203125;
+    float2 dDdx = (uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(1, 0)).xy - uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(-1, 0)).xy) * 0.60000002384185791015625;
+    float2 dDdy = (uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(0, 1)).xy - uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(0, -1)).xy) * 0.60000002384185791015625;
+    float2 param = dDdx * _46.uScale.z;
+    float2 param_1 = dDdy * _46.uScale.z;
+    float j = jacobian(param, param_1);
+    displacement = float2(0.0);
+    iHeightDisplacement.write(float4(h, displacement, 0.0), uint2(int2(gl_GlobalInvocationID.xy)));
+    iGradJacobian.write(float4(grad, j, 0.0), uint2(int2(gl_GlobalInvocationID.xy)));
+}
+
--- a/reference/shaders-msl/comp/basic.comp
+++ b/reference/shaders-msl/comp/basic.comp
@ -0,0 +1,34 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+struct SSBO3
+{
+    uint counter;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(0)]], device SSBO2& _45 [[buffer(1)]], device SSBO3& _48 [[buffer(2)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float4 idata = _23.in_data[ident];
+    if (dot(idata, float4(1.0, 5.0, 6.0, 2.0)) > 8.19999980926513671875)
+    {
+        uint _52 = atomic_fetch_add_explicit((volatile device atomic_uint*)&(_48.counter), 1u, memory_order_relaxed);
+        _45.out_data[_52] = idata;
+    }
+}
+
--- a/reference/shaders-msl/comp/bitfield.comp
+++ b/reference/shaders-msl/comp/bitfield.comp
@ -0,0 +1,47 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+// Implementation of the GLSL findLSB() function
+template<typename T>
+T findLSB(T x)
+{
+    return select(ctz(x), T(-1), x == T(0));
+}
+
+// Implementation of the signed GLSL findMSB() function
+template<typename T>
+T findSMSB(T x)
+{
+    T v = select(x, T(-1) - x, x < T(0));
+    return select(clz(T(0)) - (clz(v) + T(1)), T(-1), v == T(0));
+}
+
+// Implementation of the unsigned GLSL findMSB() function
+template<typename T>
+T findUMSB(T x)
+{
+    return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0));
+}
+
+kernel void main0()
+{
+    int signed_value = 0;
+    uint unsigned_value = 0u;
+    int s = extract_bits(signed_value, 5, 20);
+    uint u = extract_bits(unsigned_value, 6, 21);
+    s = insert_bits(s, 40, 5, 4);
+    u = insert_bits(u, 60u, 5, 4);
+    u = reverse_bits(u);
+    s = reverse_bits(s);
+    int v0 = popcount(u);
+    int v1 = popcount(s);
+    int v2 = findUMSB(u);
+    int v3 = findSMSB(s);
+    int v4 = findLSB(u);
+    int v5 = findLSB(s);
+}
+
--- a/reference/shaders-msl/comp/cfg-preserve-parameter.comp
+++ b/reference/shaders-msl/comp/cfg-preserve-parameter.comp
@ -0,0 +1,78 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+void out_test_0(thread const int& cond, thread int& i)
+{
+    if (cond == 0)
+    {
+        i = 40;
+    }
+    else
+    {
+        i = 60;
+    }
+}
+
+void out_test_1(thread const int& cond, thread int& i)
+{
+    switch (cond)
+    {
+        case 40:
+        {
+            i = 40;
+            break;
+        }
+        default:
+        {
+            i = 70;
+            break;
+        }
+    }
+}
+
+void inout_test_0(thread const int& cond, thread int& i)
+{
+    if (cond == 0)
+    {
+        i = 40;
+    }
+}
+
+void inout_test_1(thread const int& cond, thread int& i)
+{
+    switch (cond)
+    {
+        case 40:
+        {
+            i = 40;
+            break;
+        }
+    }
+}
+
+kernel void main0()
+{
+    int cond = 40;
+    int i = 50;
+    int param = cond;
+    int param_1 = i;
+    out_test_0(param, param_1);
+    i = param_1;
+    int param_2 = cond;
+    int param_3 = i;
+    out_test_1(param_2, param_3);
+    i = param_3;
+    int param_4 = cond;
+    int param_5 = i;
+    inout_test_0(param_4, param_5);
+    i = param_5;
+    int param_6 = cond;
+    int param_7 = i;
+    inout_test_1(param_6, param_7);
+    i = param_7;
+}
+
--- a/reference/shaders-msl/comp/coherent-block.comp
+++ b/reference/shaders-msl/comp/coherent-block.comp
@ -0,0 +1,15 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4 value;
+};
+
+kernel void main0(device SSBO& _10 [[buffer(0)]])
+{
+    _10.value = float4(20.0);
+}
+
--- a/reference/shaders-msl/comp/coherent-image.comp
+++ b/reference/shaders-msl/comp/coherent-image.comp
@ -0,0 +1,15 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    int4 value;
+};
+
+kernel void main0(device SSBO& _10 [[buffer(0)]], texture2d<int> uImage [[texture(0)]])
+{
+    _10.value = uImage.read(uint2(int2(10)));
+}
+
--- a/reference/shaders-msl/comp/culling.comp
+++ b/reference/shaders-msl/comp/culling.comp
@ -0,0 +1,36 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+#include <metal_atomic>
+
+using namespace metal;
+
+constant uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);
+
+struct SSBO
+{
+    float in_data[1];
+};
+
+struct SSBO2
+{
+    float out_data[1];
+};
+
+struct SSBO3
+{
+    uint count;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _22 [[buffer(0)]], device SSBO2& _38 [[buffer(1)]], device SSBO3& _41 [[buffer(2)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = _22.in_data[ident];
+    if (idata > 12.0)
+    {
+        uint _45 = atomic_fetch_add_explicit((volatile device atomic_uint*)&(_41.count), 1u, memory_order_relaxed);
+        _38.out_data[_45] = idata;
+    }
+}
+
--- a/reference/shaders-msl/comp/dowhile.comp
+++ b/reference/shaders-msl/comp/dowhile.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4x4 mvp;
+    float4 in_data[1];
+};
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _28 [[buffer(0)]], device SSBO2& _52 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    int i = 0;
+    float4 idat = _28.in_data[ident];
+    do
+    {
+        idat = _28.mvp * idat;
+        i++;
+    } while (i < 16);
+    _52.out_data[ident] = idat;
+}
+
--- a/reference/shaders-msl/comp/image.comp
+++ b/reference/shaders-msl/comp/image.comp
@ -0,0 +1,11 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+kernel void main0(texture2d<float> uImageIn [[texture(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], texture2d<float, access::write> uImageOut [[texture(1)]])
+{
+    float4 v = uImageIn.read(uint2((int2(gl_GlobalInvocationID.xy) + int2(uImageIn.get_width(), uImageIn.get_height()))));
+    uImageOut.write(v, uint2(int2(gl_GlobalInvocationID.xy)));
+}
+
--- a/reference/shaders-msl/comp/insert.comp
+++ b/reference/shaders-msl/comp/insert.comp
@ -0,0 +1,21 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4 out_data[1];
+};
+
+kernel void main0(device SSBO& _27 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    float4 v;
+    v.x = 10.0;
+    v.y = 30.0;
+    v.z = 70.0;
+    v.w = 90.0;
+    _27.out_data[gl_GlobalInvocationID.x] = v;
+    _27.out_data[gl_GlobalInvocationID.x].y = 20.0;
+}
+
--- a/reference/shaders-msl/comp/loop.comp
+++ b/reference/shaders-msl/comp/loop.comp
@ -0,0 +1,107 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4x4 mvp;
+    float4 in_data[1];
+};
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _24 [[buffer(0)]], device SSBO2& _177 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float4 idat = _24.in_data[ident];
+    int k = 0;
+    uint i = 0u;
+    if (idat.y == 20.0)
+    {
+        do
+        {
+            k *= 2;
+            i++;
+        } while (i < ident);
+    }
+    switch (k)
+    {
+        case 10:
+        {
+            for (;;)
+            {
+                i++;
+                if (i > 10u)
+                {
+                    break;
+                }
+                continue;
+            }
+            break;
+        }
+        default:
+        {
+            for (;;)
+            {
+                i += 2u;
+                if (i > 20u)
+                {
+                    break;
+                }
+                continue;
+            }
+            break;
+        }
+    }
+    while (k < 10)
+    {
+        idat *= 2.0;
+        k++;
+    }
+    for (uint i_1 = 0u; i_1 < 16u; i_1++, k++)
+    {
+        for (uint j = 0u; j < 30u; j++)
+        {
+            idat = _24.mvp * idat;
+        }
+    }
+    k = 0;
+    for (;;)
+    {
+        k++;
+        if (k > 10)
+        {
+            k += 2;
+        }
+        else
+        {
+            k += 3;
+            continue;
+        }
+        k += 10;
+        continue;
+    }
+    k = 0;
+    do
+    {
+        k++;
+    } while (k > 10);
+    int l = 0;
+    for (;;)
+    {
+        if (l == 5)
+        {
+            l++;
+            continue;
+        }
+        idat += float4(1.0);
+        l++;
+        continue;
+    }
+    _177.out_data[ident] = idat;
+}
+
--- a/reference/shaders-msl/comp/mat3.comp
+++ b/reference/shaders-msl/comp/mat3.comp
@ -0,0 +1,16 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO2
+{
+    float3x3 out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO2& _22 [[buffer(0)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    _22.out_data[ident] = float3x3(float3(10.0), float3(20.0), float3(40.0));
+}
+
--- a/reference/shaders-msl/comp/mod.comp
+++ b/reference/shaders-msl/comp/mod.comp
@ -0,0 +1,35 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+// Implementation of the GLSL mod() function, which is slightly different than Metal fmod()
+template<typename Tx, typename Ty>
+Tx mod(Tx x, Ty y)
+{
+    return x - y * floor(x / y);
+}
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(0)]], device SSBO2& _33 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float4 v = mod(_23.in_data[ident], _33.out_data[ident]);
+    _33.out_data[ident] = v;
+    uint4 vu = as_type<uint4>(_23.in_data[ident]) % as_type<uint4>(_33.out_data[ident]);
+    _33.out_data[ident] = as_type<float4>(vu);
+    int4 vi = as_type<int4>(_23.in_data[ident]) % as_type<int4>(_33.out_data[ident]);
+    _33.out_data[ident] = as_type<float4>(vi);
+}
+
--- a/reference/shaders-msl/comp/modf.comp
+++ b/reference/shaders-msl/comp/modf.comp
@ -0,0 +1,24 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(0)]], device SSBO2& _35 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float4 i;
+    float4 _31 = modf(_23.in_data[ident], i);
+    float4 v = _31;
+    _35.out_data[ident] = v;
+}
+
--- a/reference/shaders-msl/comp/return.comp
+++ b/reference/shaders-msl/comp/return.comp
@ -0,0 +1,36 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO2& _27 [[buffer(0)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    if (ident == 2u)
+    {
+        _27.out_data[ident] = float4(20.0);
+    }
+    else
+    {
+        if (ident == 4u)
+        {
+            _27.out_data[ident] = float4(10.0);
+            return;
+        }
+    }
+    for (int i = 0; i < 20; i++)
+    {
+        if (i == 10)
+        {
+            break;
+        }
+        return;
+    }
+    _27.out_data[ident] = float4(10.0);
+}
+
--- a/reference/shaders-msl/comp/rmw-opt.comp
+++ b/reference/shaders-msl/comp/rmw-opt.comp
@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    int a;
+};
+
+kernel void main0(device SSBO& _9 [[buffer(0)]])
+{
+    _9.a += 10;
+    _9.a -= 10;
+    _9.a *= 10;
+    _9.a /= 10;
+    _9.a = _9.a << 2;
+    _9.a = _9.a >> 3;
+    _9.a &= 40;
+    _9.a ^= 10;
+    _9.a %= 40;
+    _9.a |= 1;
+    bool c = false;
+    bool d = true;
+    c = c && d;
+    d = d || c;
+    _9.a = int(c && d);
+}
+
--- a/reference/shaders-msl/comp/shared.comp
+++ b/reference/shaders-msl/comp/shared.comp
@ -0,0 +1,27 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+constant uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);
+
+struct SSBO
+{
+    float in_data[1];
+};
+
+struct SSBO2
+{
+    float out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _22 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], device SSBO2& _44 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = _22.in_data[ident];
+    threadgroup float sShared[4];
+    sShared[gl_LocalInvocationIndex] = idata;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    _44.out_data[ident] = sShared[(4u - gl_LocalInvocationIndex) - 1u];
+}
+
--- a/reference/shaders-msl/comp/struct-layout.comp
+++ b/reference/shaders-msl/comp/struct-layout.comp
@ -0,0 +1,26 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Foo
+{
+    float4x4 m;
+};
+
+struct SSBO2
+{
+    Foo out_data[1];
+};
+
+struct SSBO
+{
+    Foo in_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO2& _23 [[buffer(0)]], device SSBO& _30 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    _23.out_data[ident].m = _30.in_data[ident].m * _30.in_data[ident].m;
+}
+
--- a/reference/shaders-msl/comp/struct-packing.comp
+++ b/reference/shaders-msl/comp/struct-packing.comp
@ -0,0 +1,100 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct S0
+{
+    float2 a[1];
+    float b;
+};
+
+struct S1
+{
+    packed_float3 a;
+    float b;
+};
+
+struct S2
+{
+    float3 a[1];
+    float b;
+};
+
+struct S3
+{
+    float2 a;
+    float b;
+};
+
+struct S4
+{
+    float2 c;
+};
+
+struct Content
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    char pad7[4];
+    float m4;
+    S4 m3s[8];
+};
+
+struct SSBO1
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    char pad3[8];
+    float2x2 m0;
+    float2x2 m1;
+    float2x3 m2[4];
+    float3x2 m3;
+    float2x2 m4;
+    float2x2 m5[9];
+    float2x3 m6[4][2];
+    float3x2 m7;
+    float array[1];
+};
+
+struct SSBO0
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    float array[1];
+};
+
+kernel void main0(device SSBO1& ssbo_430 [[buffer(0)]], device SSBO0& ssbo_140 [[buffer(1)]])
+{
+    ssbo_430.content.m0s[0].a[0] = ssbo_140.content.m0s[0].a[0];
+    ssbo_430.content.m0s[0].b = ssbo_140.content.m0s[0].b;
+    ssbo_430.content.m1s[0].a = ssbo_140.content.m1s[0].a;
+    ssbo_430.content.m1s[0].b = ssbo_140.content.m1s[0].b;
+    ssbo_430.content.m2s[0].a[0] = ssbo_140.content.m2s[0].a[0];
+    ssbo_430.content.m2s[0].b = ssbo_140.content.m2s[0].b;
+    ssbo_430.content.m0.a[0] = ssbo_140.content.m0.a[0];
+    ssbo_430.content.m0.b = ssbo_140.content.m0.b;
+    ssbo_430.content.m1.a = ssbo_140.content.m1.a;
+    ssbo_430.content.m1.b = ssbo_140.content.m1.b;
+    ssbo_430.content.m2.a[0] = ssbo_140.content.m2.a[0];
+    ssbo_430.content.m2.b = ssbo_140.content.m2.b;
+    ssbo_430.content.m3.a = ssbo_140.content.m3.a;
+    ssbo_430.content.m3.b = ssbo_140.content.m3.b;
+    ssbo_430.content.m4 = ssbo_140.content.m4;
+    ssbo_430.content.m3s[0].c = ssbo_140.content.m3s[0].c;
+    ssbo_430.content.m3s[1].c = ssbo_140.content.m3s[1].c;
+    ssbo_430.content.m3s[2].c = ssbo_140.content.m3s[2].c;
+    ssbo_430.content.m3s[3].c = ssbo_140.content.m3s[3].c;
+    ssbo_430.content.m3s[4].c = ssbo_140.content.m3s[4].c;
+    ssbo_430.content.m3s[5].c = ssbo_140.content.m3s[5].c;
+    ssbo_430.content.m3s[6].c = ssbo_140.content.m3s[6].c;
+    ssbo_430.content.m3s[7].c = ssbo_140.content.m3s[7].c;
+}
+
--- a/reference/shaders-msl/comp/torture-loop.comp
+++ b/reference/shaders-msl/comp/torture-loop.comp
@ -0,0 +1,51 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO
+{
+    float4x4 mvp;
+    float4 in_data[1];
+};
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _24 [[buffer(0)]], device SSBO2& _89 [[buffer(1)]])
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float4 idat = _24.in_data[ident];
+    int k = 0;
+    for (;;)
+    {
+        int _39 = k;
+        int _40 = _39 + 1;
+        k = _40;
+        if (_40 < 10)
+        {
+            idat *= 2.0;
+            k++;
+            continue;
+        }
+        else
+        {
+            break;
+        }
+    }
+    for (uint i = 0u; i < 16u; i++, k++)
+    {
+        for (uint j = 0u; j < 30u; j++)
+        {
+            idat = _24.mvp * idat;
+        }
+    }
+    do
+    {
+        k++;
+    } while (k > 10);
+    _89.out_data[ident] = idat;
+}
+
--- a/reference/shaders-msl/comp/type-alias.comp
+++ b/reference/shaders-msl/comp/type-alias.comp
@ -0,0 +1,53 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct S0
+{
+    float4 a;
+};
+
+struct S1
+{
+    float4 a;
+};
+
+struct SSBO0
+{
+    S0 s0s[1];
+};
+
+struct SSBO1
+{
+    S1 s1s[1];
+};
+
+struct SSBO2
+{
+    float4 outputs[1];
+};
+
+float4 overload(thread const S0& s0)
+{
+    return s0.a;
+}
+
+float4 overload(thread const S1& s1)
+{
+    return s1.a;
+}
+
+kernel void main0(device SSBO0& _36 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO1& _55 [[buffer(1)]], device SSBO2& _66 [[buffer(2)]])
+{
+    S0 s0;
+    s0.a = _36.s0s[gl_GlobalInvocationID.x].a;
+    S1 s1;
+    s1.a = _55.s1s[gl_GlobalInvocationID.x].a;
+    S0 param = s0;
+    S1 param_1 = s1;
+    _66.outputs[gl_GlobalInvocationID.x] = overload(param) + overload(param_1);
+}
+
--- a/reference/shaders-msl/comp/udiv.comp
+++ b/reference/shaders-msl/comp/udiv.comp
@ -0,0 +1,20 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct SSBO2
+{
+    uint outputs[1];
+};
+
+struct SSBO
+{
+    uint inputs[1];
+};
+
+kernel void main0(device SSBO2& _10 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(1)]])
+{
+    _10.outputs[gl_GlobalInvocationID.x] = _23.inputs[gl_GlobalInvocationID.x] / 29u;
+}
+
--- a/reference/shaders-msl/flatten/struct.flatten.vert
+++ b/reference/shaders-msl/flatten/struct.flatten.vert
@ -5,7 +5,7 @@ using namespace metal;

 struct Light
 {
-    float3 Position;
+    packed_float3 Position;
    float Radius;
    float4 Color;
 };
--- a/reference/shaders-msl/flatten/swizzle.flatten.vert
+++ b/reference/shaders-msl/flatten/swizzle.flatten.vert
@ -39,7 +39,7 @@ vertex main0_out main0(constant UBO& _22 [[buffer(0)]])
    out.oA = _22.A;
    out.oB = float4(_22.B0, _22.B1);
    out.oC = float4(_22.C0, _22.C1);
-    out.oD = float4(float3(_22.D0), _22.D1);
+    out.oD = float4(_22.D0, _22.D1);
    out.oE = float4(_22.E0, _22.E1, _22.E2, _22.E3);
    out.oF = float4(_22.F0, _22.F1, _22.F2);
    return out;
--- a/reference/shaders-msl/vert/copy.flatten.vert
+++ b/reference/shaders-msl/vert/copy.flatten.vert
@ -5,7 +5,7 @@ using namespace metal;

 struct Light
 {
-    float3 Position;
+    packed_float3 Position;
    float Radius;
    float4 Color;
 };
--- a/reference/shaders-msl/vert/dynamic.flatten.vert
+++ b/reference/shaders-msl/vert/dynamic.flatten.vert
@ -5,7 +5,7 @@ using namespace metal;

 struct Light
 {
-    float3 Position;
+    packed_float3 Position;
    float Radius;
    float4 Color;
 };
--- a/reference/shaders-msl/vert/functions.vert
+++ b/reference/shaders-msl/vert/functions.vert
@ -47,15 +47,15 @@ T degrees(T r)
 template<typename T>
 T findLSB(T x)
 {
-    return select(ctz(x), -1, x == 0);
+    return select(ctz(x), T(-1), x == T(0));
 }

 // Implementation of the signed GLSL findMSB() function
 template<typename T>
 T findSMSB(T x)
 {
-    T v = select(x, -1 - x, x < 0);
-    return select(clz(0) - (clz(v) + 1), -1, v == 0);
+    T v = select(x, T(-1) - x, x < T(0));
+    return select(clz(T(0)) - (clz(v) + T(1)), T(-1), v == T(0));
 }

 // Returns the determinant of a 2x2 matrix.
--- a/reference/shaders-msl/vert/ubo.alignment.vert
+++ b/reference/shaders-msl/vert/ubo.alignment.vert
@ -31,7 +31,7 @@ vertex main0_out main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]]
    main0_out out = {};
    out.gl_Position = _18.mvp * in.aVertex;
    out.vNormal = in.aNormal;
-    out.vColor = float3(_18.color) * _18.opacity;
+    out.vColor = _18.color * _18.opacity;
    out.vSize = _18.targSize * _18.opacity;
    return out;
 }
--- a/shaders-msl/asm/comp/bitcast_iadd.asm.comp
+++ b/shaders-msl/asm/comp/bitcast_iadd.asm.comp
@ -0,0 +1,79 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+			   OpDecorate %inputs Restrict
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+			   OpDecorate %outputs Restrict
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of IAdd
+         %result_iadd_0 = OpIAdd %uvec4 %input0 %input1
+         %result_iadd_1 = OpIAdd %uvec4 %input1 %input0
+         %result_iadd_2 = OpIAdd %uvec4 %input0 %input0
+         %result_iadd_3 = OpIAdd %uvec4 %input1 %input1
+         %result_iadd_4 = OpIAdd %ivec4 %input0 %input0
+         %result_iadd_5 = OpIAdd %ivec4 %input1 %input1
+         %result_iadd_6 = OpIAdd %ivec4 %input0 %input1
+         %result_iadd_7 = OpIAdd %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/asm/comp/bitcast_sar.asm.comp
+++ b/shaders-msl/asm/comp/bitcast_sar.asm.comp
@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of ShiftRightArithmetic
+         %result_iadd_0 = OpShiftRightArithmetic %uvec4 %input0 %input1
+         %result_iadd_1 = OpShiftRightArithmetic %uvec4 %input1 %input0
+         %result_iadd_2 = OpShiftRightArithmetic %uvec4 %input0 %input0
+         %result_iadd_3 = OpShiftRightArithmetic %uvec4 %input1 %input1
+         %result_iadd_4 = OpShiftRightArithmetic %ivec4 %input0 %input0
+         %result_iadd_5 = OpShiftRightArithmetic %ivec4 %input1 %input1
+         %result_iadd_6 = OpShiftRightArithmetic %ivec4 %input0 %input1
+         %result_iadd_7 = OpShiftRightArithmetic %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/asm/comp/bitcast_sdiv.asm.comp
+++ b/shaders-msl/asm/comp/bitcast_sdiv.asm.comp
@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of SDiv
+         %result_iadd_0 = OpSDiv %uvec4 %input0 %input1
+         %result_iadd_1 = OpSDiv %uvec4 %input1 %input0
+         %result_iadd_2 = OpSDiv %uvec4 %input0 %input0
+         %result_iadd_3 = OpSDiv %uvec4 %input1 %input1
+         %result_iadd_4 = OpSDiv %ivec4 %input0 %input0
+         %result_iadd_5 = OpSDiv %ivec4 %input1 %input1
+         %result_iadd_6 = OpSDiv %ivec4 %input0 %input1
+         %result_iadd_7 = OpSDiv %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/asm/comp/bitcast_slr.asm.comp
+++ b/shaders-msl/asm/comp/bitcast_slr.asm.comp
@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of ShiftRightLogical
+         %result_iadd_0 = OpShiftRightLogical %uvec4 %input0 %input1
+         %result_iadd_1 = OpShiftRightLogical %uvec4 %input1 %input0
+         %result_iadd_2 = OpShiftRightLogical %uvec4 %input0 %input0
+         %result_iadd_3 = OpShiftRightLogical %uvec4 %input1 %input1
+         %result_iadd_4 = OpShiftRightLogical %ivec4 %input0 %input0
+         %result_iadd_5 = OpShiftRightLogical %ivec4 %input1 %input1
+         %result_iadd_6 = OpShiftRightLogical %ivec4 %input0 %input1
+         %result_iadd_7 = OpShiftRightLogical %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/asm/comp/multiple-entry.asm.comp
+++ b/shaders-msl/asm/comp/multiple-entry.asm.comp
@ -0,0 +1,97 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Fragment %func_alt "main2" %frag_in %frag_out
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+			   OpDecorate %inputs Restrict
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+			   OpDecorate %outputs Restrict
+			   OpDecorate %frag_in Location 0
+			   OpDecorate %frag_out Location 0
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %float = OpTypeFloat 32
+		 %vec4 = OpTypeVector %float 4
+		 %vec4_input_ptr = OpTypePointer Input %vec4
+		 %vec4_output_ptr = OpTypePointer Output %vec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+		 %frag_in = OpVariable %vec4_input_ptr Input
+		 %frag_out = OpVariable %vec4_output_ptr Output
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of IAdd
+         %result_iadd_0 = OpIAdd %uvec4 %input0 %input1
+         %result_iadd_1 = OpIAdd %uvec4 %input1 %input0
+         %result_iadd_2 = OpIAdd %uvec4 %input0 %input0
+         %result_iadd_3 = OpIAdd %uvec4 %input1 %input1
+         %result_iadd_4 = OpIAdd %ivec4 %input0 %input0
+         %result_iadd_5 = OpIAdd %ivec4 %input1 %input1
+         %result_iadd_6 = OpIAdd %ivec4 %input0 %input1
+         %result_iadd_7 = OpIAdd %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
+
+          %func_alt = OpFunction %void None %main_func
+          %block_alt = OpLabel
+		       %frag_input_value = OpLoad %vec4 %frag_in
+		       OpStore %frag_out %frag_input_value
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/asm/comp/quantize.asm.comp
+++ b/shaders-msl/asm/comp/quantize.asm.comp
@ -0,0 +1,67 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 38
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %4 "main"
+               OpExecutionMode %4 LocalSize 1 1 1
+               OpSource ESSL 310
+               OpName %4 "main"
+               OpName %10 "SSBO0"
+               OpMemberName %10 0 "scalar"
+               OpMemberName %10 1 "vec2_val"
+               OpMemberName %10 2 "vec3_val"
+               OpMemberName %10 3 "vec4_val"
+               OpName %12 ""
+               OpMemberDecorate %10 0 Offset 0
+               OpMemberDecorate %10 1 Offset 8
+               OpMemberDecorate %10 2 Offset 16
+               OpMemberDecorate %10 3 Offset 32
+               OpDecorate %10 BufferBlock
+               OpDecorate %12 DescriptorSet 0
+               OpDecorate %12 Binding 0
+          %2 = OpTypeVoid
+          %3 = OpTypeFunction %2
+          %6 = OpTypeFloat 32
+          %7 = OpTypeVector %6 2
+          %8 = OpTypeVector %6 3
+          %9 = OpTypeVector %6 4
+         %10 = OpTypeStruct %6 %7 %8 %9
+         %11 = OpTypePointer Uniform %10
+         %12 = OpVariable %11 Uniform
+         %13 = OpTypeInt 32 1
+         %14 = OpConstant %13 0
+         %15 = OpTypePointer Uniform %6
+         %20 = OpConstant %13 1
+         %21 = OpTypePointer Uniform %7
+         %26 = OpConstant %13 2
+         %27 = OpTypePointer Uniform %8
+         %32 = OpConstant %13 3
+         %33 = OpTypePointer Uniform %9
+          %4 = OpFunction %2 None %3
+          %5 = OpLabel
+         %16 = OpAccessChain %15 %12 %14
+         %17 = OpLoad %6 %16
+         %18 = OpQuantizeToF16 %6 %17
+         %19 = OpAccessChain %15 %12 %14
+               OpStore %19 %18
+         %22 = OpAccessChain %21 %12 %20
+         %23 = OpLoad %7 %22
+         %24 = OpQuantizeToF16 %7 %23
+         %25 = OpAccessChain %21 %12 %20
+               OpStore %25 %24
+         %28 = OpAccessChain %27 %12 %26
+         %29 = OpLoad %8 %28
+         %30 = OpQuantizeToF16 %8 %29
+         %31 = OpAccessChain %27 %12 %26
+               OpStore %31 %30
+         %34 = OpAccessChain %33 %12 %32
+         %35 = OpLoad %9 %34
+         %36 = OpQuantizeToF16 %9 %35
+         %37 = OpAccessChain %33 %12 %32
+               OpStore %37 %36
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/comp/bake_gradient.comp
+++ b/shaders-msl/comp/bake_gradient.comp
@ -0,0 +1,55 @@
+#version 310 es
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(binding = 0) uniform sampler2D uHeight;
+layout(binding = 1) uniform sampler2D uDisplacement;
+layout(rgba16f, binding = 2) uniform writeonly mediump image2D iHeightDisplacement;
+layout(rgba16f, binding = 3) uniform writeonly mediump image2D iGradJacobian;
+
+layout(binding = 4) uniform UBO
+{
+    vec4 uInvSize;
+    vec4 uScale;
+};
+
+mediump float jacobian(mediump vec2 dDdx, mediump vec2 dDdy)
+{
+    return (1.0 + dDdx.x) * (1.0 + dDdy.y) - dDdx.y * dDdy.x;
+}
+#define LAMBDA 1.2
+
+void main()
+{
+    vec4 uv = (vec2(gl_GlobalInvocationID.xy) * uInvSize.xy).xyxy + 0.5 * uInvSize;
+
+    float h = textureLod(uHeight, uv.xy, 0.0).x;
+
+    // Compute the heightmap gradient by simple differentiation.
+    float x0 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(-1, 0)).x;
+    float x1 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(+1, 0)).x;
+    float y0 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(0, -1)).x;
+    float y1 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(0, +1)).x;
+    vec2 grad = uScale.xy * 0.5 * vec2(x1 - x0, y1 - y0);
+
+    // Displacement map must be sampled with a different offset since it's a smaller texture.
+    vec2 displacement = LAMBDA * textureLod(uDisplacement, uv.zw, 0.0).xy;
+
+    // Compute jacobian.
+    vec2 dDdx = 0.5 * LAMBDA * (
+        textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(+1, 0)).xy -
+        textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(-1, 0)).xy);
+    vec2 dDdy = 0.5 * LAMBDA * (
+        textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(0, +1)).xy -
+        textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(0, -1)).xy);
+    float j = jacobian(dDdx * uScale.z, dDdy * uScale.z);
+
+    displacement = vec2(0.0);
+
+    // Read by vertex shader/tess shader.
+    imageStore(iHeightDisplacement, ivec2(gl_GlobalInvocationID.xy), vec4(h, displacement, 0.0));
+
+    // Read by fragment shader.
+    imageStore(iGradJacobian, ivec2(gl_GlobalInvocationID.xy), vec4(grad, j, 0.0));
+}
+
--- a/shaders-msl/comp/basic.comp
+++ b/shaders-msl/comp/basic.comp
@ -0,0 +1,28 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+layout(std430, binding = 2) buffer SSBO3
+{
+    uint counter;
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 idata = in_data[ident];
+    if (dot(idata, vec4(1.0, 5.0, 6.0, 2.0)) > 8.2)
+    {
+        out_data[atomicAdd(counter, 1u)] = idata;
+    }
+}
+
--- a/shaders-msl/comp/bitfield.comp
+++ b/shaders-msl/comp/bitfield.comp
@ -0,0 +1,23 @@
+#version 310 es
+
+void main()
+{
+   int signed_value = 0;
+   uint unsigned_value = 0u;
+
+   int s = bitfieldExtract(signed_value, 5, 20);
+   uint u = bitfieldExtract(unsigned_value, 6, 21);
+   s = bitfieldInsert(s, 40, 5, 4);
+   u = bitfieldInsert(u, 60u, 5, 4);
+
+   u = bitfieldReverse(u);
+   s = bitfieldReverse(s);
+
+   int v0 = bitCount(u);
+   int v1 = bitCount(s);
+
+   int v2 = findMSB(u);
+   int v3 = findMSB(s);
+   int v4 = findLSB(u);
+   int v5 = findLSB(s);
+}
--- a/shaders-msl/comp/cfg-preserve-parameter.comp
+++ b/shaders-msl/comp/cfg-preserve-parameter.comp
@ -0,0 +1,54 @@
+#version 310 es
+
+// We write in all paths (and no reads), so should just be out.
+void out_test_0(int cond, inout int i)
+{
+   if (cond == 0)
+      i = 40;
+   else
+      i = 60;
+}
+
+// We write in all paths (and no reads), so should just be out.
+void out_test_1(int cond, inout int i)
+{
+   switch (cond)
+   {
+      case 40:
+         i = 40;
+         break;
+
+      default:
+         i = 70;
+         break;
+   }
+}
+
+// We don't write in all paths, so should be inout.
+void inout_test_0(int cond, inout int i)
+{
+   if (cond == 0)
+      i = 40;
+}
+
+void inout_test_1(int cond, inout int i)
+{
+   switch (cond)
+   {
+      case 40:
+         i = 40;
+         break;
+   }
+}
+
+
+void main()
+{
+   int cond = 40;
+   int i = 50;
+
+   out_test_0(cond, i);
+   out_test_1(cond, i);
+   inout_test_0(cond, i);
+   inout_test_1(cond, i);
+}
--- a/shaders-msl/comp/coherent-block.comp
+++ b/shaders-msl/comp/coherent-block.comp
@ -0,0 +1,12 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(binding = 1) coherent restrict writeonly buffer SSBO
+{
+	vec4 value;	
+};
+
+void main()
+{
+	value = vec4(20.0);
+}
--- a/shaders-msl/comp/coherent-image.comp
+++ b/shaders-msl/comp/coherent-image.comp
@ -0,0 +1,14 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(binding = 1) coherent restrict writeonly buffer SSBO
+{
+	ivec4 value;	
+};
+
+layout(r32i, binding = 3) coherent readonly restrict uniform mediump iimage2D uImage;
+
+void main()
+{
+	value = imageLoad(uImage, ivec2(10));
+}
--- a/shaders-msl/comp/culling.comp
+++ b/shaders-msl/comp/culling.comp
@ -0,0 +1,26 @@
+#version 310 es
+layout(local_size_x = 4) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    float in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    float out_data[];
+};
+
+layout(std430, binding = 2) buffer SSBO3
+{
+    uint count;
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = in_data[ident];
+    if (idata > 12.0)
+        out_data[atomicAdd(count, 1u)] = idata;
+}
+
--- a/shaders-msl/comp/dowhile.comp
+++ b/shaders-msl/comp/dowhile.comp
@ -0,0 +1,31 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    mat4 mvp;
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+int i;
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+
+    i = 0;
+    vec4 idat = in_data[ident];
+    do
+    {
+        idat = mvp * idat;
+        i++;
+    } while(i < 16);
+
+    out_data[ident] = idat;
+}
+
--- a/shaders-msl/comp/image.comp
+++ b/shaders-msl/comp/image.comp
@ -0,0 +1,12 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(rgba8, binding = 0) uniform readonly mediump image2D uImageIn;
+layout(rgba8, binding = 1) uniform writeonly mediump image2D uImageOut;
+
+void main()
+{
+    vec4 v = imageLoad(uImageIn, ivec2(gl_GlobalInvocationID.xy) + imageSize(uImageIn));
+    imageStore(uImageOut, ivec2(gl_GlobalInvocationID.xy), v);
+}
+
--- a/shaders-msl/comp/insert.comp
+++ b/shaders-msl/comp/insert.comp
@ -0,0 +1,18 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) writeonly buffer SSBO
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    vec4 v;
+    v.x = 10.0;
+    v.y = 30.0;
+    v.z = 70.0;
+    v.w = 90.0;
+    out_data[gl_GlobalInvocationID.x] = v;
+    out_data[gl_GlobalInvocationID.x].y = 20.0;
+}
--- a/shaders-msl/comp/loop.comp
+++ b/shaders-msl/comp/loop.comp
@ -0,0 +1,98 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    mat4 mvp;
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 idat = in_data[ident];
+
+    int k = 0;
+    uint i = 0u;
+
+    if (idat.y == 20.0)
+    {
+        do
+        {
+            k = k * 2;
+            i++;
+        } while (i < ident);
+    }
+
+    switch (k)
+    {
+        case 10:
+            for (;;)
+            {
+                i++;
+                if (i > 10u)
+                    break;
+            }
+            break;
+
+        default:
+            for (;;)
+            {
+               i += 2u;
+               if (i > 20u)
+                  break;
+            }
+            break;
+    }
+
+    while (k < 10)
+    {
+        idat *= 2.0;
+        k++;
+    }
+
+    for (uint i = 0u; i < 16u; i++, k++)
+        for (uint j = 0u; j < 30u; j++)
+            idat = mvp * idat;
+
+    k = 0;
+    for (;;)
+    {
+        k++;
+        if (k > 10)
+        {
+            k += 2;
+        }
+        else
+        {
+            k += 3;
+            continue;
+        }
+
+        k += 10;
+    }
+
+    k = 0;
+    do
+    {
+        k++;
+    } while (k > 10);
+
+    int l = 0;
+    for (;; l++)
+    {
+        if (l == 5)
+        {
+            continue;
+        }
+        
+        idat += 1.0;
+    }
+    out_data[ident] = idat;
+}
+
--- a/shaders-msl/comp/mat3.comp
+++ b/shaders-msl/comp/mat3.comp
@ -0,0 +1,14 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    mat3 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    out_data[ident] = mat3(vec3(10.0), vec3(20.0), vec3(40.0));
+}
+
--- a/shaders-msl/comp/mod.comp
+++ b/shaders-msl/comp/mod.comp
@ -0,0 +1,26 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 v = mod(in_data[ident], out_data[ident]);
+    out_data[ident] = v;
+
+    uvec4 vu = floatBitsToUint(in_data[ident]) % floatBitsToUint(out_data[ident]);
+    out_data[ident] = uintBitsToFloat(vu);
+
+    ivec4 vi = floatBitsToInt(in_data[ident]) % floatBitsToInt(out_data[ident]);
+    out_data[ident] = intBitsToFloat(vi);
+}
+
--- a/shaders-msl/comp/modf.comp
+++ b/shaders-msl/comp/modf.comp
@ -0,0 +1,23 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 i;
+    //vec4 v = frexp(in_data[ident], i);
+    //out_data[ident] = ldexp(v, i);
+    vec4 v = modf(in_data[ident], i);
+    out_data[ident] = v;
+}
+
--- a/shaders-msl/comp/return.comp
+++ b/shaders-msl/comp/return.comp
@ -0,0 +1,33 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+
+    if (ident == 2u)
+    {
+        out_data[ident] = vec4(20.0);
+    }
+    else if (ident == 4u)
+    {
+        out_data[ident] = vec4(10.0);
+        return;
+    }
+
+    for (int i = 0; i < 20; i++)
+    {
+        if (i == 10)
+            break;
+
+        return;
+    }
+
+    out_data[ident] = vec4(10.0);
+}
+
--- a/shaders-msl/comp/rmw-opt.comp
+++ b/shaders-msl/comp/rmw-opt.comp
@ -0,0 +1,27 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	int a;
+};
+
+void main()
+{
+	a += 10;
+	a -= 10;
+	a *= 10;
+	a /= 10;
+	a <<= 2;
+	a >>= 3;
+	a &= 40;
+	a ^= 10;
+	a %= 40;
+	a |= 1;
+
+	bool c = false;
+	bool d = true;
+	c = c && d;
+	d = d || c;
+	a = c && d ? 1 : 0;
+}
--- a/shaders-msl/comp/shared.comp
+++ b/shaders-msl/comp/shared.comp
@ -0,0 +1,27 @@
+#version 310 es
+layout(local_size_x = 4) in;
+
+shared float sShared[gl_WorkGroupSize.x];
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    float in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    float out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = in_data[ident];
+
+    sShared[gl_LocalInvocationIndex] = idata;
+    memoryBarrierShared();
+    barrier();
+
+    out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
+}
+
--- a/shaders-msl/comp/struct-layout.comp
+++ b/shaders-msl/comp/struct-layout.comp
@ -0,0 +1,24 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+struct Foo
+{
+    mat4 m;
+};
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    Foo in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    Foo out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    out_data[ident].m = in_data[ident].m * in_data[ident].m;
+}
+
--- a/shaders-msl/comp/struct-packing.comp
+++ b/shaders-msl/comp/struct-packing.comp
@ -0,0 +1,76 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+struct S0
+{
+    vec2 a[1];
+    float b;
+};
+
+struct S1
+{
+    vec3 a;
+    float b;
+};
+
+struct S2
+{
+    vec3 a[1];
+    float b;
+};
+
+struct S3
+{
+    vec2 a;
+    float b;
+};
+
+struct S4
+{
+	vec2 c;
+};
+
+struct Content
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    float m4;
+
+	S4 m3s[8];
+};
+
+layout(binding = 1, std430) buffer SSBO1
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+
+    layout(column_major) mat2 m0;
+    layout(column_major) mat2 m1;
+    layout(column_major) mat2x3 m2[4];
+    layout(column_major) mat3x2 m3;
+    layout(row_major) mat2 m4;
+    layout(row_major) mat2 m5[9];
+    layout(row_major) mat2x3 m6[4][2];
+    layout(row_major) mat3x2 m7;
+    float array[];
+} ssbo_430;
+
+layout(binding = 0, std140) buffer SSBO0
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    float array[];
+} ssbo_140;
+
+void main()
+{
+    ssbo_430.content = ssbo_140.content;
+}
+
--- a/shaders-msl/comp/torture-loop.comp
+++ b/shaders-msl/comp/torture-loop.comp
@ -0,0 +1,40 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    mat4 mvp;
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 idat = in_data[ident];
+
+    int k = 0;
+
+    // Continue with side effects.
+    while (++k < 10)
+    {
+        idat *= 2.0;
+        k++;
+    }
+
+    // Again used here ...
+    for (uint i = 0u; i < 16u; i++, k++)
+        for (uint j = 0u; j < 30u; j++)
+            idat = mvp * idat;
+
+    do
+    {
+        k++;
+    } while (k > 10);
+    out_data[ident] = idat;
+}
+
--- a/shaders-msl/comp/type-alias.comp
+++ b/shaders-msl/comp/type-alias.comp
@ -0,0 +1,45 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+struct S0
+{
+   vec4 a;
+};
+
+struct S1
+{
+   vec4 a;
+};
+
+vec4 overload(S0 s0)
+{
+   return s0.a;
+}
+
+vec4 overload(S1 s1)
+{
+   return s1.a;
+}
+
+layout(std430, binding = 0) buffer SSBO0
+{
+   S0 s0s[];
+};
+
+layout(std430, binding = 1) buffer SSBO1
+{
+   S1 s1s[];
+};
+
+layout(std430, binding = 2) buffer SSBO2
+{
+   vec4 outputs[];
+};
+
+
+void main()
+{
+   S0 s0 = s0s[gl_GlobalInvocationID.x];
+   S1 s1 = s1s[gl_GlobalInvocationID.x];
+   outputs[gl_GlobalInvocationID.x] = overload(s0) + overload(s1); 
+}
--- a/shaders-msl/comp/udiv.comp
+++ b/shaders-msl/comp/udiv.comp
@ -0,0 +1,17 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+    uint inputs[];
+};
+
+layout(std430, binding = 0) buffer SSBO2
+{
+    uint outputs[];
+};
+
+void main()
+{
+    outputs[gl_GlobalInvocationID.x] = inputs[gl_GlobalInvocationID.x] / 29u;
+}
--- a/spirv_common.hpp
+++ b/spirv_common.hpp
@ -343,7 +343,8 @@ struct SPIREntryPoint
 {
 	SPIREntryPoint(uint32_t self_, spv::ExecutionModel execution_model, std::string entry_name)
 	    : self(self_)
-	    , name(std::move(entry_name))
+	    , name(entry_name)
+	    , orig_name(entry_name)
 	    , model(execution_model)
 	{
 	}
@ -351,6 +352,7 @@ struct SPIREntryPoint

 	uint32_t self = 0;
 	std::string name;
+	std::string orig_name;
 	std::vector<uint32_t> interface_variables;

 	uint64_t flags = 0;
@ -857,7 +859,9 @@ struct SPIRConstant : IVariant

 	uint32_t constant_type;
 	ConstantMatrix m;
-	bool specialization = false; // If the constant is a specialization constant (i.e. created with OpSpecConstant*).
+	bool specialization = false; // If this constant is a specialization constant (i.e. created with OpSpecConstant*).
+	bool is_used_as_array_length =
+	    false; // If this constant is used as an array length which creates specialization restrictions on some backends.

 	// For composites which are constant arrays, etc.
 	std::vector<uint32_t> subconstants;
@ -915,6 +919,10 @@ public:
 	{
 		return type;
 	}
+	uint32_t get_id() const
+	{
+		return holder ? holder->self : 0;
+	}
 	bool empty() const
 	{
 		return !holder;
--- a/spirv_cross.cpp
+++ b/spirv_cross.cpp
@ -1484,18 +1484,21 @@ void Compiler::parse(const Instruction &instruction)
 	case OpTypeArray:
 	{
 		uint32_t id = ops[0];
-
-		auto &base = get<SPIRType>(ops[1]);
 		auto &arraybase = set<SPIRType>(id);

-		arraybase = base;
+		uint32_t tid = ops[1];
+		auto &base = get<SPIRType>(tid);

-		auto *c = maybe_get<SPIRConstant>(ops[2]);
+		arraybase = base;
+		arraybase.parent_type = tid;
+
+		uint32_t cid = ops[2];
+		mark_used_as_array_length(cid);
+		auto *c = maybe_get<SPIRConstant>(cid);
 		bool literal = c && !c->specialization;

 		arraybase.array_size_literal.push_back(literal);
-		arraybase.array.push_back(literal ? c->scalar() : ops[2]);
-		arraybase.parent_type = ops[1];
+		arraybase.array.push_back(literal ? c->scalar() : cid);
 		// Do NOT set arraybase.self!
 		break;
 	}
@ -2513,6 +2516,14 @@ vector<string> Compiler::get_entry_points() const
 	return entries;
 }

+unordered_map<string, string> Compiler::get_entry_point_name_map() const
+{
+	unordered_map<string, string> entries;
+	for (auto &entry : entry_points)
+		entries[entry.second.orig_name] = entry.second.name;
+	return entries;
+}
+
 void Compiler::set_entry_point(const std::string &name)
 {
 	auto &entry = get_entry_point(name);
@ -2912,6 +2923,29 @@ const SPIRConstant &Compiler::get_constant(uint32_t id) const
 	return get<SPIRConstant>(id);
 }

+void Compiler::mark_used_as_array_length(uint32_t id)
+{
+	switch (ids[id].get_type())
+	{
+	case TypeConstant:
+		get<SPIRConstant>(id).is_used_as_array_length = true;
+		break;
+
+	case TypeConstantOp:
+	{
+		auto &cop = get<SPIRConstantOp>(id);
+		for (uint32_t arg_id : cop.arguments)
+			mark_used_as_array_length(arg_id);
+	}
+
+	case TypeUndef:
+		return;
+
+	default:
+		SPIRV_CROSS_THROW("Array lengths must be a constant instruction (OpConstant.. or OpSpecConstant...).");
+	}
+}
+
 static bool exists_unaccessed_path_to_return(const CFG &cfg, uint32_t block, const unordered_set<uint32_t> &blocks)
 {
 	// This block accesses the variable.
--- a/spirv_cross.hpp
+++ b/spirv_cross.hpp
@ -254,6 +254,11 @@ public:
 	std::vector<std::string> get_entry_points() const;
 	void set_entry_point(const std::string &name);

+	// Returns a mapping between the original entry point name in the SPIR-V and a modified
+	// name defined by the backend. Some backends (eg. MSL) restrict the legal names allowed
+	// for entry point names (eg. "main" is illegal in MSL).
+	std::unordered_map<std::string, std::string> get_entry_point_name_map() const;
+
 	// Returns the internal data structure for entry points to allow poking around.
 	const SPIREntryPoint &get_entry_point(const std::string &name) const;
 	SPIREntryPoint &get_entry_point(const std::string &name);
@ -336,6 +341,10 @@ public:
 	SPIRConstant &get_constant(uint32_t id);
 	const SPIRConstant &get_constant(uint32_t id) const;

+	// Recursively marks any constants referenced by the specified constant instruction as being used
+	// as an array length. The id must be a constant instruction (SPIRConstant or SPIRConstantOp).
+	void mark_used_as_array_length(uint32_t id);
+
 	uint32_t get_current_id_bound() const
 	{
 		return uint32_t(ids.size());
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@ -3359,15 +3359,6 @@ string CompilerGLSL::to_function_args(uint32_t img, const SPIRType &imgtype, boo
 	return farg_str;
 }

-// Some languages may have additional standard library functions whose names conflict
-// with a function defined in the body of the shader. Subclasses can override to rename
-// the function name defined in the shader to avoid conflict with the language standard
-// functions (eg. MSL includes saturate()).
-string CompilerGLSL::clean_func_name(string func_name)
-{
-	return func_name;
-}
-
 void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop, const uint32_t *args, uint32_t)
 {
 	GLSLstd450 op = static_cast<GLSLstd450>(eop);
@ -3855,6 +3846,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice

 	bool access_chain_is_arrayed = false;
 	bool row_major_matrix_needs_conversion = is_non_native_row_major_matrix(base);
+	bool vector_is_packed = false;
 	bool pending_array_enclose = false;
 	bool dimension_flatten = false;

@ -3951,12 +3943,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 				}
 			}

-			if (member_is_packed_type(*type, index))
-			{
-				auto &membertype = get<SPIRType>(type->member_types[index]);
-				expr = unpack_expression_type(expr, membertype);
-			}
-
+			vector_is_packed = member_is_packed_type(*type, index);
 			row_major_matrix_needs_conversion = member_is_non_native_row_major_matrix(*type, index);
 			type = &get<SPIRType>(type->member_types[index]);
 		}
@ -3982,6 +3969,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 		// Vector -> Scalar
 		else if (type->vecsize > 1)
 		{
+			if (vector_is_packed)
+				expr = unpack_expression_type(expr, *type);
+
 			if (index_is_literal)
 			{
 				expr += ".";
@ -4835,7 +4825,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)

 		string funexpr;
 		vector<string> arglist;
-		funexpr += clean_func_name(to_name(func)) + "(";
+		funexpr += to_name(func) + "(";
 		for (uint32_t i = 0; i < length; i++)
 		{
 			// Do not pass in separate images or samplers if we're remapping
@ -6183,7 +6173,7 @@ bool CompilerGLSL::member_is_non_native_row_major_matrix(const SPIRType &type, u

 // Checks whether the member is in packed data type, that might need to be unpacked.
 // GLSL does not define packed data types, but certain subclasses do.
-bool CompilerGLSL::member_is_packed_type(const SPIRType &type, uint32_t index)
+bool CompilerGLSL::member_is_packed_type(const SPIRType &type, uint32_t index) const
 {
 	return has_member_decoration(type.self, index, DecorationCPacked);
 }
@ -6745,11 +6735,11 @@ void CompilerGLSL::emit_function_prototype(SPIRFunction &func, uint64_t return_f

 	if (func.self == entry_point)
 	{
-		decl += clean_func_name("main");
+		decl += "main";
 		processing_entry_point = true;
 	}
 	else
-		decl += clean_func_name(to_name(func.self));
+		decl += to_name(func.self);

 	decl += "(";
 	vector<string> arglist;
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@ -206,7 +206,6 @@ protected:
 	                                     uint32_t grad_x, uint32_t grad_y, uint32_t lod, uint32_t coffset,
 	                                     uint32_t offset, uint32_t bias, uint32_t comp, uint32_t sample,
 	                                     bool *p_forward);
-	virtual std::string clean_func_name(std::string func_name);
 	virtual void emit_buffer_block(const SPIRVariable &type);
 	virtual void emit_push_constant_block(const SPIRVariable &var);
 	virtual void emit_uniform(const SPIRVariable &var);
@ -278,7 +277,7 @@ protected:

 	bool is_non_native_row_major_matrix(uint32_t id);
 	bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index);
-	bool member_is_packed_type(const SPIRType &type, uint32_t index);
+	bool member_is_packed_type(const SPIRType &type, uint32_t index) const;
 	virtual std::string convert_row_major_matrix(std::string exp_str);

 	std::unordered_set<std::string> local_variable_names;
@ -386,7 +385,7 @@ protected:
 	std::string to_member_name(const SPIRType &type, uint32_t index);
 	std::string type_to_glsl_constructor(const SPIRType &type);
 	std::string argument_decl(const SPIRFunction::Parameter &arg);
-	std::string to_qualifiers_glsl(uint32_t id);
+	virtual std::string to_qualifiers_glsl(uint32_t id);
 	const char *to_precision_qualifiers_glsl(uint32_t id);
 	virtual const char *to_storage_qualifiers_glsl(const SPIRVariable &var);
 	const char *flags_to_precision_qualifiers_glsl(const SPIRType &type, uint64_t flags);
@ -418,7 +417,7 @@ protected:
 	// and force recompile.
 	bool check_atomic_image(uint32_t id);

-	void replace_illegal_names();
+	virtual void replace_illegal_names();

 	void replace_fragment_output(SPIRVariable &var);
 	void replace_fragment_outputs();
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@ -31,9 +31,6 @@ CompilerMSL::CompilerMSL(vector<uint32_t> spirv_, vector<MSLVertexAttr> *p_vtx_a
                         vector<MSLResourceBinding> *p_res_bindings)
    : CompilerGLSL(move(spirv_))
 {
-	populate_func_name_overrides();
-	populate_var_name_overrides();
-
 	if (p_vtx_attrs)
 		for (auto &va : *p_vtx_attrs)
 			vtx_attrs_by_location[va.location] = &va;
@ -47,9 +44,6 @@ CompilerMSL::CompilerMSL(const uint32_t *ir, size_t word_count, MSLVertexAttr *p
                         MSLResourceBinding *p_res_bindings, size_t res_bindings_count)
    : CompilerGLSL(ir, word_count)
 {
-	populate_func_name_overrides();
-	populate_var_name_overrides();
-
 	if (p_vtx_attrs)
 		for (size_t i = 0; i < vtx_attrs_count; i++)
 			vtx_attrs_by_location[p_vtx_attrs[i].location] = &p_vtx_attrs[i];
@ -59,24 +53,13 @@ CompilerMSL::CompilerMSL(const uint32_t *ir, size_t word_count, MSLVertexAttr *p
 			resource_bindings.push_back(&p_res_bindings[i]);
 }

-// Populate the collection of function names that need to be overridden
-void CompilerMSL::populate_func_name_overrides()
-{
-	func_name_overrides["main"] = "main0";
-	func_name_overrides["saturate"] = "saturate0";
-}
-
-void CompilerMSL::populate_var_name_overrides()
-{
-	var_name_overrides["kernel"] = "kernel0";
-	var_name_overrides["bias"] = "bias0";
-}
-
 string CompilerMSL::compile()
 {
 	// Force a classic "C" locale, reverts when function returns
 	ClassicLocale classic_locale;

+	replace_illegal_names();
+
 	// Set main function name if it was explicitly set
 	if (!options.entry_point_name.empty())
 		set_name(entry_point, options.entry_point_name);
@ -102,6 +85,14 @@ string CompilerMSL::compile()
 	localize_global_variables();
 	extract_global_variables_from_functions();

+	// Mark any non-stage-in structs to be tightly packed.
+	mark_packable_structs();
+
+	// Metal does not allow dynamic array lengths.
+	// Resolve any specialization constants that are used for array lengths.
+	if (options.resolve_specialized_array_lengths)
+		resolve_specialized_array_lengths();
+
 	// Do not deal with GLES-isms like precision, older extensions and such.
 	CompilerGLSL::options.vulkan_semantics = true;
 	CompilerGLSL::options.es = false;
@ -114,12 +105,13 @@ string CompilerMSL::compile()
 	backend.swizzle_is_function = false;
 	backend.shared_is_implied = false;
 	backend.native_row_major_matrix = false;
+	backend.flexible_member_array_supported = false;

 	uint32_t pass_count = 0;
 	do
 	{
 		if (pass_count >= 3)
-			SPIRV_CROSS_THROW("Over 3 compilation loops detected. Must be a bug!");
+			SPIRV_CROSS_THROW("Over 2 compilation loops detected. Must be a bug!");

 		reset();

@ -184,7 +176,7 @@ void CompilerMSL::preprocess_op_codes()
 	}
 }

-// Move the Private global variables to the entry function.
+// Move the Private and Workgroup global variables to the entry function.
 // Non-constant variables cannot have global scope in Metal.
 void CompilerMSL::localize_global_variables()
 {
@ -192,11 +184,12 @@ void CompilerMSL::localize_global_variables()
 	auto iter = global_variables.begin();
 	while (iter != global_variables.end())
 	{
-		uint32_t gv_id = *iter;
-		auto &gbl_var = get<SPIRVariable>(gv_id);
-		if (gbl_var.storage == StorageClassPrivate)
+		uint32_t v_id = *iter;
+		auto &var = get<SPIRVariable>(v_id);
+		if (var.storage == StorageClassPrivate || var.storage == StorageClassWorkgroup)
 		{
-			entry_func.add_local_variable(gv_id);
+			var.storage = StorageClassFunction;
+			entry_func.add_local_variable(v_id);
 			iter = global_variables.erase(iter);
 		}
 		else
@ -204,6 +197,21 @@ void CompilerMSL::localize_global_variables()
 	}
 }

+// Metal does not allow dynamic array lengths.
+// Turn off specialization of any constants that are used for array lengths.
+void CompilerMSL::resolve_specialized_array_lengths()
+{
+	for (auto &id : ids)
+	{
+		if (id.get_type() == TypeConstant)
+		{
+			auto &c = id.get<SPIRConstant>();
+			if (c.is_used_as_array_length)
+				c.specialization = false;
+		}
+	}
+}
+
 // For any global variable accessed directly by a function,
 // extract that variable and add it as an argument to that function.
 void CompilerMSL::extract_global_variables_from_functions()
@ -313,6 +321,54 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
 	}
 }

+// For all variables that are some form of non-input-output interface block, mark that all the structs
+// that are recursively contained within the type referenced by that variable should be packed tightly.
+void CompilerMSL::mark_packable_structs()
+{
+	for (auto &id : ids)
+	{
+		if (id.get_type() == TypeVariable)
+		{
+			auto &var = id.get<SPIRVariable>();
+			if (var.storage != StorageClassFunction && !is_hidden_variable(var))
+			{
+				auto &type = get<SPIRType>(var.basetype);
+				if (type.pointer &&
+				    (type.storage == StorageClassUniform || type.storage == StorageClassUniformConstant ||
+				     type.storage == StorageClassPushConstant || type.storage == StorageClassStorageBuffer) &&
+				    (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)))
+					mark_as_packable(type);
+			}
+		}
+	}
+}
+
+// If the specified type is a struct, it and any nested structs
+// are marked as packable with the DecorationCPacked decoration,
+void CompilerMSL::mark_as_packable(SPIRType &type)
+{
+	// If this is not the base type (eg. it's a pointer or array), tunnel down
+	if (type.parent_type)
+	{
+		mark_as_packable(get<SPIRType>(type.parent_type));
+		return;
+	}
+
+	if (type.basetype == SPIRType::Struct)
+	{
+		set_decoration(type.self, DecorationCPacked);
+
+		// Recurse
+		size_t mbr_cnt = type.member_types.size();
+		for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
+		{
+			uint32_t mbr_type_id = type.member_types[mbr_idx];
+			auto &mbr_type = get<SPIRType>(mbr_type_id);
+			mark_as_packable(mbr_type);
+		}
+	}
+}
+
 // If a vertex attribute exists at the location, it is marked as being used by this shader
 void CompilerMSL::mark_location_as_used_by_shader(uint32_t location, StorageClass storage)
 {
@ -668,13 +724,14 @@ void CompilerMSL::align_struct(SPIRType &ib_type)
 	MemberSorter member_sorter(ib_type, meta[ib_type_id], MemberSorter::Offset);
 	member_sorter.sort();

-	uint32_t curr_offset = 0;
+	uint32_t curr_offset;
 	uint32_t mbr_cnt = uint32_t(ib_type.member_types.size());

 	// Test the alignment of each member, and if a member should be closer to the previous
 	// member than the default spacing expects, it is likely that the previous member is in
 	// a packed format. If so, and the previous member is packable, pack it.
 	// For example...this applies to any 3-element vector that is followed by a scalar.
+	curr_offset = 0;
 	for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
 	{
 		// Align current offset to the current member's default alignment.
@ -697,6 +754,7 @@ void CompilerMSL::align_struct(SPIRType &ib_type)
 	// Test the alignment of each member, and if a member is positioned farther than its
 	// alignment and the end of the previous member, add a dummy padding member that will
 	// be added before the current member when the delaration of this struct is emitted.
+	curr_offset = 0;
 	for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
 	{
 		// Align current offset to the current member's default alignment.
@ -816,7 +874,7 @@ void CompilerMSL::emit_custom_functions()
 			statement("template<typename T>");
 			statement("T findLSB(T x)");
 			begin_scope();
-			statement("return select(ctz(x), -1, x == 0);");
+			statement("return select(ctz(x), T(-1), x == T(0));");
 			end_scope();
 			statement("");
 			break;
@ -826,7 +884,7 @@ void CompilerMSL::emit_custom_functions()
 			statement("template<typename T>");
 			statement("T findUMSB(T x)");
 			begin_scope();
-			statement("return select(clz(0) - (clz(x) + 1), -1, x == 0);");
+			statement("return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0));");
 			end_scope();
 			statement("");
 			break;
@ -836,8 +894,19 @@ void CompilerMSL::emit_custom_functions()
 			statement("template<typename T>");
 			statement("T findSMSB(T x)");
 			begin_scope();
-			statement("T v = select(x, -1 - x, x < 0);");
-			statement("return select(clz(0) - (clz(v) + 1), -1, v == 0);");
+			statement("T v = select(x, T(-1) - x, x < T(0));");
+			statement("return select(clz(T(0)) - (clz(v) + T(1)), T(-1), v == T(0));");
+			end_scope();
+			statement("");
+			break;
+
+		case SPVFuncImplArrayCopy:
+			statement("// Implementation of an array copy function to cover GLSL's ability to copy an array via "
+			          "assignment. ");
+			statement("template<typename T>");
+			statement("void spvArrayCopy(thread T* dst, thread const T* src, uint count)");
+			begin_scope();
+			statement("for (uint i = 0; i < count; *dst++ = *src++, i++);");
 			end_scope();
 			statement("");
 			break;
@ -979,51 +1048,41 @@ void CompilerMSL::emit_custom_functions()

 void CompilerMSL::emit_resources()
 {
-
-	// Output all basic struct types which are not Block or BufferBlock as these are declared inplace
-	// when such variables are instantiated.
+	// Output non-interface structs. These include local function structs
+	// and structs nested within uniform and read-write buffers.
+	unordered_set<uint32_t> declared_structs;
 	for (auto &id : ids)
 	{
 		if (id.get_type() == TypeType)
 		{
 			auto &type = id.get<SPIRType>();
-			if (type.basetype == SPIRType::Struct && type.array.empty() && !type.pointer &&
-			    !has_decoration(type.self, DecorationBlock) && !has_decoration(type.self, DecorationBufferBlock))
+			uint32_t type_id = type.self;
+
+			bool is_struct = (type.basetype == SPIRType::Struct) && type.array.empty();
+			bool is_block =
+			    has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock);
+			bool is_basic_struct = is_struct && !type.pointer && !is_block;
+
+			bool is_interface = (type.storage == StorageClassInput || type.storage == StorageClassOutput ||
+			                     type.storage == StorageClassUniformConstant);
+			bool is_non_interface_block = is_struct && type.pointer && is_block && !is_interface;
+
+			bool is_declarable_struct = is_basic_struct || is_non_interface_block;
+
+			// Align and emit declarable structs...but avoid declaring each more than once.
+			if (is_declarable_struct && declared_structs.count(type_id) == 0)
 			{
+				declared_structs.insert(type_id);
+
+				if (has_decoration(type_id, DecorationCPacked))
+					align_struct(type);
+
 				emit_struct(type);
 			}
 		}
 	}

-	// Output Uniform buffers and constants
-	unordered_set<uint32_t> declared_interface_structs;
-	for (auto &id : ids)
-	{
-		if (id.get_type() == TypeVariable)
-		{
-			auto &var = id.get<SPIRVariable>();
-			auto &type = get<SPIRType>(var.basetype);
-
-			if (var.storage != StorageClassFunction && type.pointer &&
-			    (type.storage == StorageClassUniform || type.storage == StorageClassUniformConstant ||
-			     type.storage == StorageClassPushConstant || type.storage == StorageClassStorageBuffer) &&
-			    (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)) &&
-			    !is_hidden_variable(var))
-			{
-				// Avoid declaring the same struct multiple times.
-				if (declared_interface_structs.count(type.self) == 0)
-				{
-					align_struct(type);
-					emit_struct(type);
-					declared_interface_structs.insert(type.self);
-				}
-			}
-		}
-	}
-
-	declare_undefined_values();
-
-	// Output interface blocks.
+	// Output interface structs.
 	emit_interface_block(stage_in_var_id);
 	for (auto &nsi_var : non_stage_in_input_var_ids)
 		emit_interface_block(nsi_var.second);
@ -1071,7 +1130,8 @@ void CompilerMSL::emit_specialization_constants()
 	// the work group size at compile time in SPIR-V, and [[threads_per_threadgroup]] would need to be passed around as a global.
 	// The work group size may be a specialization constant.
 	if (workgroup_size_id)
-		statement("constant uint3 gl_WorkGroupSize = ", constant_expression(get<SPIRConstant>(workgroup_size_id)), ";");
+		statement("constant uint3 ", builtin_to_glsl(BuiltInWorkgroupSize, StorageClassWorkgroup), " = ",
+		          constant_expression(get<SPIRConstant>(workgroup_size_id)), ";");

 	if (!spec_consts.empty() || workgroup_size_id)
 		statement("");
@ -1090,7 +1150,6 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 #define BFOP(op) emit_binary_func_op(ops[0], ops[1], ops[2], ops[3], #op)
 #define BFOP_CAST(op, type) \
 	emit_binary_func_op_cast(ops[0], ops[1], ops[2], ops[3], #op, type, opcode_is_sign_invariant(opcode))
-#define BFOP(op) emit_binary_func_op(ops[0], ops[1], ops[2], ops[3], #op)
 #define UFOP(op) emit_unary_func_op(ops[0], ops[1], ops[2], #op)

 	auto ops = stream(instruction);
@ -1434,12 +1493,27 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 	}

 	case OpStore:
-	{
-		if (!maybe_emit_input_struct_assignment(ops[0], ops[1]))
-			CompilerGLSL::emit_instruction(instruction);
+		if (maybe_emit_input_struct_assignment(ops[0], ops[1]))
+			break;

+		if (maybe_emit_array_assignment(ops[0], ops[1]))
+			break;
+
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+
+	// Compute barriers
+	case OpMemoryBarrier:
+		emit_barrier(0, ops[0], ops[1]);
+		break;
+
+	case OpControlBarrier:
+		// In GLSL a memory barrier is often followed by a control barrier.
+		// But in MSL, memory barriers are also control barriers, so don't
+		// emit a simple control barrier if a memory barrier has just been emitted.
+		if (previous_instruction_opcode != OpMemoryBarrier)
+			emit_barrier(ops[0], ops[1], ops[2]);
 		break;
-	}

 	// OpOuterProduct

@ -1447,6 +1521,75 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 		CompilerGLSL::emit_instruction(instruction);
 		break;
 	}
+
+	previous_instruction_opcode = opcode;
+}
+
+void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem)
+{
+	if (get_entry_point().model != ExecutionModelGLCompute)
+		return;
+
+	string bar_stmt = "threadgroup_barrier(mem_flags::";
+
+	uint32_t mem_sem = id_mem_sem ? get<SPIRConstant>(id_mem_sem).scalar() : MemorySemanticsMaskNone;
+	switch (mem_sem)
+	{
+	case MemorySemanticsCrossWorkgroupMemoryMask:
+		bar_stmt += "mem_device";
+		break;
+
+	case MemorySemanticsSubgroupMemoryMask:
+	case MemorySemanticsWorkgroupMemoryMask:
+	case MemorySemanticsAtomicCounterMemoryMask:
+		bar_stmt += "mem_threadgroup";
+		break;
+
+	case MemorySemanticsImageMemoryMask:
+		bar_stmt += "mem_texture";
+		break;
+
+	case MemorySemanticsAcquireMask:
+	case MemorySemanticsReleaseMask:
+	case MemorySemanticsAcquireReleaseMask:
+	case MemorySemanticsSequentiallyConsistentMask:
+	case MemorySemanticsUniformMemoryMask:
+	case MemorySemanticsMaskNone:
+	default:
+		bar_stmt += "mem_none";
+		break;
+	}
+
+	if (options.msl_version >= 2.0)
+	{
+		bar_stmt += ", ";
+
+		// Use the wider of the two scopes (smaller value)
+		uint32_t exe_scope = id_exe_scope ? get<SPIRConstant>(id_exe_scope).scalar() : ScopeInvocation;
+		uint32_t mem_scope = id_mem_scope ? get<SPIRConstant>(id_mem_scope).scalar() : ScopeInvocation;
+		uint32_t scope = min(exe_scope, mem_scope);
+		switch (scope)
+		{
+		case ScopeCrossDevice:
+		case ScopeDevice:
+			bar_stmt += "memory_scope_device";
+			break;
+
+		case ScopeSubgroup:
+		case ScopeInvocation:
+			bar_stmt += "memory_scope_simdgroup";
+			break;
+
+		case ScopeWorkgroup:
+		default:
+			bar_stmt += "memory_scope_threadgroup";
+			break;
+		}
+	}
+
+	bar_stmt += ");";
+
+	statement(bar_stmt);
 }

 // Since MSL does not allow structs to be nested within the stage_in struct, the original input
@ -1456,53 +1599,79 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 // Returns whether the struct assignment was emitted.
 bool CompilerMSL::maybe_emit_input_struct_assignment(uint32_t id_lhs, uint32_t id_rhs)
 {
-    // We only care about assignments of an entire struct
-    uint32_t type_id = expression_type_id(id_rhs);
-    auto &type = get<SPIRType>(type_id);
-    if (type.basetype != SPIRType::Struct)
-        return false;
+	// We only care about assignments of an entire struct
+	uint32_t type_id = expression_type_id(id_rhs);
+	auto &type = get<SPIRType>(type_id);
+	if (type.basetype != SPIRType::Struct)
+		return false;

-    // We only care about assignments from Input variables
-    auto *p_v_rhs = maybe_get_backing_variable(id_rhs);
+	// We only care about assignments from Input variables
+	auto *p_v_rhs = maybe_get_backing_variable(id_rhs);
 	if (!(p_v_rhs && p_v_rhs->storage == StorageClassInput))
-        return false;
+		return false;

-    // Get the ID of the type of the underlying RHS variable.
-    // This will be an Input OpTypePointer containing the qualified member names.
-    uint32_t tid_v_rhs = p_v_rhs->basetype;
+	// Get the ID of the type of the underlying RHS variable.
+	// This will be an Input OpTypePointer containing the qualified member names.
+	uint32_t tid_v_rhs = p_v_rhs->basetype;

-    // Ensure the LHS variable has been declared
-    auto *p_v_lhs = maybe_get_backing_variable(id_lhs);
-    if (p_v_lhs)
-        flush_variable_declaration(p_v_lhs->self);
+	// Ensure the LHS variable has been declared
+	auto *p_v_lhs = maybe_get_backing_variable(id_lhs);
+	if (p_v_lhs)
+		flush_variable_declaration(p_v_lhs->self);

-    size_t mbr_cnt = type.member_types.size();
-    for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
-    {
-        string expr;
+	size_t mbr_cnt = type.member_types.size();
+	for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
+	{
+		string expr;

-        //LHS
-        expr += to_name(id_lhs);
-        expr += ".";
-        expr += to_member_name(type, mbr_idx);
+		//LHS
+		expr += to_name(id_lhs);
+		expr += ".";
+		expr += to_member_name(type, mbr_idx);

-        expr += " = ";
+		expr += " = ";

-        //RHS
-        string qual_mbr_name = get_member_qualified_name(tid_v_rhs, mbr_idx);
-        if (qual_mbr_name.empty())
-        {
-            expr += to_name(id_rhs);
-            expr += ".";
-            expr += to_member_name(type, mbr_idx);
-        }
-        else
-            expr += qual_mbr_name;
+		//RHS
+		string qual_mbr_name = get_member_qualified_name(tid_v_rhs, mbr_idx);
+		if (qual_mbr_name.empty())
+		{
+			expr += to_name(id_rhs);
+			expr += ".";
+			expr += to_member_name(type, mbr_idx);
+		}
+		else
+			expr += qual_mbr_name;

-        statement(expr, ";");
-    }
+		statement(expr, ";");
+	}

-    return true;
+	return true;
+}
+
+// Since MSL does not allow arrays to be copied via simple variable assignment,
+// if the LHS and RHS represent an assignment of an entire array, it must be
+// implemented by calling an array copy function.
+// Returns whether the struct assignment was emitted.
+bool CompilerMSL::maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs)
+{
+	// Assignment from an array initializer is fine.
+	if (ids[id_rhs].get_type() == TypeConstant)
+		return false;
+
+	// We only care about assignments of an entire array
+	auto &type = expression_type(id_rhs);
+	if (type.array.size() == 0)
+		return false;
+
+	// Ensure the LHS variable has been declared
+	auto *p_v_lhs = maybe_get_backing_variable(id_lhs);
+	if (p_v_lhs)
+		flush_variable_declaration(p_v_lhs->self);
+
+	statement("spvArrayCopy(", to_expression(id_lhs), ", ", to_expression(id_rhs), ", ", to_array_size(type, 0), ");");
+	register_write(id_lhs);
+
+	return true;
 }

 // Emits one of the atomic functions. In MSL, the atomic functions operate on pointers
@ -1605,7 +1774,7 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 		emit_unary_func_op(result_type, id, args[0], "pack_float_to_unorm2x16");
 		break;
 	case GLSLstd450PackHalf2x16:
-		emit_unary_func_op(result_type, id, args[0], "pack_half_to_snorm2x16");
+		emit_unary_func_op(result_type, id, args[0], "unsupported_GLSLstd450PackHalf2x16"); // Currently unsupported
 		break;

 	case GLSLstd450UnpackSnorm4x8:
@ -1621,7 +1790,7 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 		emit_unary_func_op(result_type, id, args[0], "unpack_unorm2x16_to_float");
 		break;
 	case GLSLstd450UnpackHalf2x16:
-		emit_unary_func_op(result_type, id, args[0], "unpack_snorm2x16_to_half");
+		emit_unary_func_op(result_type, id, args[0], "unsupported_GLSLstd450UnpackHalf2x16"); // Currently unsupported
 		break;

 	case GLSLstd450PackDouble2x32:
@ -1687,7 +1856,7 @@ void CompilerMSL::emit_function_prototype(SPIRFunction &func, uint64_t)
 	auto &type = get<SPIRType>(func.return_type);
 	decl += func_type_decl(type);
 	decl += " ";
-	decl += clean_func_name(to_name(func.self));
+	decl += to_name(func.self);

 	decl += "(";

@ -2291,14 +2460,7 @@ string CompilerMSL::func_type_decl(SPIRType &type)
 	return entry_type + " " + return_type;
 }

-// Ensures the function name is not "main", which is illegal in MSL
-string CompilerMSL::clean_func_name(string func_name)
-{
-	auto iter = func_name_overrides.find(func_name);
-	return (iter != func_name_overrides.end()) ? iter->second : func_name;
-}
-
-// In MSL address space qualifiers are required for all pointer or reference arguments
+// In MSL, address space qualifiers are required for all pointer or reference arguments
 string CompilerMSL::get_argument_address_space(const SPIRVariable &argument)
 {
 	const auto &type = get<SPIRType>(argument.basetype);
@ -2478,20 +2640,33 @@ uint32_t CompilerMSL::get_metal_resource_index(SPIRVariable &var, SPIRType::Base
 // Returns the name of the entry point of this shader
 string CompilerMSL::get_entry_point_name()
 {
-	return clean_func_name(to_name(entry_point));
+	return to_name(entry_point);
 }

 string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg)
 {
+	auto &var = get<SPIRVariable>(arg.id);
 	auto &type = expression_type(arg.id);
 	bool constref = !arg.alias_global_variable && (!type.pointer || arg.write_count == 0);

 	// TODO: Check if this arg is an uniform pointer
 	bool pointer = type.storage == StorageClassUniformConstant;

-	auto &var = get<SPIRVariable>(arg.id);
-	return join(constref ? "const " : "", type_to_glsl(type), pointer ? " " : "& ", to_name(var.self),
-	            type_to_array_glsl(type));
+	string decl;
+	if (constref)
+		decl += "const ";
+
+	decl += type_to_glsl(type);
+
+	if (is_array(type))
+		decl += "*";
+	else if (!pointer)
+		decl += "&";
+
+	decl += " ";
+	decl += to_name(var.self);
+
+	return decl;
 }

 // If we're currently in the entry point function, and the object
@ -2526,17 +2701,79 @@ string CompilerMSL::to_qualified_member_name(const SPIRType &type, uint32_t inde
 // if the first chars are _ and a digit, which indicate a transient name.
 string CompilerMSL::ensure_valid_name(string name, string pfx)
 {
-	if (name.size() >= 2 && name[0] == '_' && isdigit(name[1]))
+	return (name.size() >= 2 && name[0] == '_' && isdigit(name[1])) ? (pfx + name) : name;
+}
+
+// Replace all names that match MSL keywords or Metal Standard Library functions.
+void CompilerMSL::replace_illegal_names()
+{
+	static const unordered_set<string> keywords = {
+		"kernel", "bias",
+	};
+
+	static const unordered_set<string> illegal_func_names = {
+		"main", "saturate",
+	};
+
+	for (auto &id : ids)
 	{
-		return join(pfx, name);
+		switch (id.get_type())
+		{
+		case TypeVariable:
+		{
+			auto &dec = meta[id.get_id()].decoration;
+			if (keywords.find(dec.alias) != end(keywords))
+				dec.alias += "0";
+
+			break;
+		}
+
+		case TypeFunction:
+		{
+			auto &dec = meta[id.get_id()].decoration;
+			if (illegal_func_names.find(dec.alias) != end(illegal_func_names))
+				dec.alias += "0";
+
+			break;
+		}
+
+		case TypeType:
+		{
+			for (auto &mbr_dec : meta[id.get_id()].members)
+				if (keywords.find(mbr_dec.alias) != end(keywords))
+					mbr_dec.alias += "0";
+
+			break;
+		}
+
+		default:
+			break;
+		}
 	}
-	else
+
+	for (auto &entry : entry_points)
 	{
-		auto iter = var_name_overrides.find(name);
-		return (iter != var_name_overrides.end()) ? iter->second : name;
+		// Change both the entry point name and the alias, to keep them synced.
+		string &ep_name = entry.second.name;
+		if (illegal_func_names.find(ep_name) != end(illegal_func_names))
+		{
+			ep_name += "0";
+			meta[entry.first].decoration.alias = ep_name;
+		}
 	}
 }

+string CompilerMSL::to_qualifiers_glsl(uint32_t id)
+{
+	string quals;
+
+	auto &type = expression_type(id);
+	if (type.storage == StorageClassWorkgroup)
+		quals += "threadgroup ";
+
+	return quals;
+}
+
 // The optional id parameter indicates the object whose type we are trying
 // to find the description for. It is optional. Most type descriptions do not
 // depend on a specific object's use of that type.
@ -2882,11 +3119,11 @@ string CompilerMSL::built_in_func_arg(BuiltIn builtin, bool prefix_comma)
 	if (prefix_comma)
 		bi_arg += ", ";

-    bi_arg += builtin_type_decl(builtin);
+	bi_arg += builtin_type_decl(builtin);
 	bi_arg += " " + builtin_to_glsl(builtin, StorageClassInput);
 	bi_arg += " [[" + builtin_qualifier(builtin) + "]]";

-    return bi_arg;
+	return bi_arg;
 }

 // Returns the byte size of a struct member.
@ -2905,9 +3142,6 @@ size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type,
 	case SPIRType::Sampler:
 		SPIRV_CROSS_THROW("Querying size of opaque object.");

-	case SPIRType::Struct:
-		return get_declared_struct_size(type);
-
 	default:
 	{
 		size_t component_size = type.width / 8;
@ -2915,8 +3149,12 @@ size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type,
 		unsigned columns = type.columns;

 		// For arrays, we can use ArrayStride to get an easy check.
+		// Runtime arrays will have zero size so force to min of one.
 		if (!type.array.empty())
-			return type_struct_member_array_stride(struct_type, index) * type.array.back();
+			return type_struct_member_array_stride(struct_type, index) * max(type.array.back(), 1U);
+
+		if (type.basetype == SPIRType::Struct)
+			return get_declared_struct_size(type);

 		if (columns == 1) // An unpacked 3-element vector is the same size as a 4-element vector.
 		{
@ -2967,11 +3205,14 @@ size_t CompilerMSL::get_declared_struct_member_alignment(const SPIRType &struct_
 	{
 		// Alignment of packed type is the same as the underlying component size.
 		// Alignment of unpacked type is the same as the type size (or one matrix column).
-		auto dec_mask = get_member_decoration_mask(struct_type.self, index);
-		if (dec_mask & (1ull << DecorationCPacked))
+		if (member_is_packed_type(struct_type, index))
 			return type.width / 8;
 		else
-			return get_declared_struct_member_size(struct_type, index) / type.columns;
+		{
+			// Divide by array size and colum count. Runtime arrays will have zero size so force to min of one.
+			uint32_t array_size = type.array.empty() ? 1 : max(type.array.back(), 1U);
+			return get_declared_struct_member_size(struct_type, index) / (type.columns * array_size);
+		}
 	}
 	}
 }
@ -2981,7 +3222,7 @@ bool CompilerMSL::skip_argument(uint32_t) const
 	return false;
 }

-bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, uint32_t /*length*/)
+bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, uint32_t length)
 {
 	// Since MSL exists in a single execution scope, function prototype declarations are not
 	// needed, and clutter the output. If secondary functions are output (either as a SPIR-V
@ -2989,12 +3230,11 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui
 	// suppress_missing_prototypes to suppress compiler warnings of missing function prototypes.

 	// Mark if the input requires the implementation of an SPIR-V function that does not exist in Metal.
-	SPVFuncImpl spv_func = compiler.get_spv_func_impl(opcode, args);
+	SPVFuncImpl spv_func = get_spv_func_impl(opcode, args);
 	if (spv_func != SPVFuncImplNone)
 	{
 		compiler.spv_function_implementations.insert(spv_func);
 		suppress_missing_prototypes = true;
-		return true;
 	}

 	switch (opcode)
@ -3026,21 +3266,38 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui
 		break;
 	}

+	// Keep track of the instruction return types, mapped by ID
+	if (length > 1)
+		result_types[args[1]] = args[0];
+
 	return true;
 }

 // Returns an enumeration of a SPIR-V function that needs to be output for certain Op codes.
-CompilerMSL::SPVFuncImpl CompilerMSL::get_spv_func_impl(Op opcode, const uint32_t *args)
+CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op opcode, const uint32_t *args)
 {
 	switch (opcode)
 	{
 	case OpFMod:
 		return SPVFuncImplMod;

+	case OpStore:
+	{
+		// Get the result type of the RHS. Since this is run as a pre-processing stage,
+		// we must extract the result type directly from the Instruction, rather than the ID.
+		uint32_t id_rhs = args[1];
+		uint32_t type_id_rhs = result_types[id_rhs];
+		if ((compiler.ids[id_rhs].get_type() != TypeConstant) && type_id_rhs &&
+		    compiler.is_array(compiler.get<SPIRType>(type_id_rhs)))
+			return SPVFuncImplArrayCopy;
+
+		break;
+	}
+
 	case OpExtInst:
 	{
 		uint32_t extension_set = args[2];
-		if (get<SPIRExtension>(extension_set).ext == SPIRExtension::GLSL)
+		if (compiler.get<SPIRExtension>(extension_set).ext == SPIRExtension::GLSL)
 		{
 			GLSLstd450 op_450 = static_cast<GLSLstd450>(args[3]);
 			switch (op_450)
@ -3057,7 +3314,7 @@ CompilerMSL::SPVFuncImpl CompilerMSL::get_spv_func_impl(Op opcode, const uint32_
 				return SPVFuncImplFindUMsb;
 			case GLSLstd450MatrixInverse:
 			{
-				auto &mat_type = get<SPIRType>(args[0]);
+				auto &mat_type = compiler.get<SPIRType>(args[0]);
 				switch (mat_type.columns)
 				{
 				case 2:
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@ -76,8 +76,10 @@ public:
 	// Options for compiling to Metal Shading Language
 	struct Options
 	{
-		bool enable_point_size_builtin = true;
 		std::string entry_point_name;
+		float msl_version = 1.2f;
+		bool enable_point_size_builtin = true;
+		bool resolve_specialized_array_lengths = true;
 	};

 	const Options &get_options() const
@ -101,6 +103,7 @@ public:
 		SPVFuncImplFindILsb,
 		SPVFuncImplFindSMsb,
 		SPVFuncImplFindUMsb,
+		SPVFuncImplArrayCopy,
 		SPVFuncImplInverse2x2,
 		SPVFuncImplInverse3x3,
 		SPVFuncImplInverse4x4,
@ -165,10 +168,15 @@ protected:
 	std::string unpack_expression_type(std::string expr_str, const SPIRType &type) override;
 	std::string bitcast_glsl_op(const SPIRType &result_type, const SPIRType &argument_type) override;
 	bool skip_argument(uint32_t id) const override;
+	std::string to_qualifiers_glsl(uint32_t id) override;
+	void replace_illegal_names() override;

 	void preprocess_op_codes();
 	void localize_global_variables();
 	void extract_global_variables_from_functions();
+	void resolve_specialized_array_lengths();
+	void mark_packable_structs();
+	void mark_as_packable(SPIRType &type);

 	std::unordered_map<uint32_t, std::set<uint32_t>> function_global_vars;
 	void extract_global_variables_from_function(uint32_t func_id, std::set<uint32_t> &added_arg_ids,
@ -181,12 +189,10 @@ protected:
 	void emit_resources();
 	void emit_specialization_constants();
 	void emit_interface_block(uint32_t ib_var_id);
-	void populate_func_name_overrides();
-	void populate_var_name_overrides();
 	bool maybe_emit_input_struct_assignment(uint32_t id_lhs, uint32_t id_rhs);
+	bool maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs);

 	std::string func_type_decl(SPIRType &type);
-	std::string clean_func_name(std::string func_name) override;
 	std::string entry_point_args(bool append_comma);
 	std::string get_entry_point_name();
 	std::string to_qualified_member_name(const SPIRType &type, uint32_t index);
@ -210,17 +216,15 @@ protected:
 	void align_struct(SPIRType &ib_type);
 	bool is_member_packable(SPIRType &ib_type, uint32_t index);
 	MSLStructMemberKey get_struct_member_key(uint32_t type_id, uint32_t index);
-	SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args);
 	std::string get_argument_address_space(const SPIRVariable &argument);
 	void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, const char *op, uint32_t mem_order_1,
 	                         uint32_t mem_order_2, bool has_mem_order_2, uint32_t op0, uint32_t op1 = 0,
 	                         bool op1_is_pointer = false, uint32_t op2 = 0);
 	const char *get_memory_order(uint32_t spv_mem_sem);
 	void add_pragma_line(const std::string &line);
+	void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem);

 	Options options;
-	std::unordered_map<std::string, std::string> func_name_overrides;
-	std::unordered_map<std::string, std::string> var_name_overrides;
 	std::set<SPVFuncImpl> spv_function_implementations;
 	std::unordered_map<uint32_t, MSLVertexAttr *> vtx_attrs_by_location;
 	std::map<uint32_t, uint32_t> non_stage_in_input_var_ids;
@ -238,6 +242,7 @@ protected:
 	std::string stage_out_var_name = "out";
 	std::string stage_uniform_var_name = "uniforms";
 	std::string sampler_name_suffix = "Smplr";
+	spv::Op previous_instruction_opcode = spv::OpNop;

 	// OpcodeHandler that handles several MSL preprocessing operations.
 	struct OpCodePreprocessor : OpcodeHandler
@ -248,8 +253,10 @@ protected:
 		}

 		bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override;
+		CompilerMSL::SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args);

 		CompilerMSL &compiler;
+		std::unordered_map<uint32_t, uint32_t> result_types;
 		bool suppress_missing_prototypes = false;
 		bool uses_atomics = false;
 	};