Merge pull request #321 from brenwill/master

Enhancements to MSL compute and entry point naming.
Hans-Kristian Arntzen 2017-11-11 09:57:49 +01:00 committed by GitHub
commit 2b8fe8d711
73 changed files with 2763 additions and 229 deletions

.gitignore
View File

@ -13,5 +13,6 @@
*.opensdf
*.shader
*.a
*.bc
!CMakeLists.txt

View File

@ -447,9 +447,11 @@ struct CLIArguments
const char *cpp_interface_name = nullptr;
uint32_t version = 0;
uint32_t shader_model = 0;
uint32_t msl_version = 0;
bool es = false;
bool set_version = false;
bool set_shader_model = false;
bool set_msl_version = false;
bool set_es = false;
bool dump_resources = false;
bool force_temporary = false;
@ -483,7 +485,7 @@ static void print_help()
"[--version <GLSL version>] [--dump-resources] [--help] [--force-temporary] "
"[--vulkan-semantics] [--flatten-ubo] [--fixup-clipspace] [--flip-vert-y] [--iterations iter] "
"[--cpp] [--cpp-interface-name <name>] "
"[--msl] "
"[--msl] [--msl-version <MMmmpp>]"
"[--hlsl] [--shader-model] [--hlsl-enable-compat] "
"[--separate-shader-objects]"
"[--pls-in format input-name] [--pls-out format output-name] [--remap source_name target_name "
@ -686,6 +688,10 @@ static int main_inner(int argc, char *argv[])
args.shader_model = parser.next_uint();
args.set_shader_model = true;
});
cbs.add("--msl-version", [&args](CLIParser &parser) {
args.msl_version = parser.next_uint();
args.set_msl_version = true;
});
cbs.add("--remove-unused-variables", [&args](CLIParser &) { args.remove_unused = true; });
@ -725,6 +731,8 @@ static int main_inner(int argc, char *argv[])
auto *msl_comp = static_cast<CompilerMSL *>(compiler.get());
auto msl_opts = msl_comp->get_options();
if (args.set_msl_version)
msl_opts.msl_version = args.msl_version;
msl_comp->set_options(msl_opts);
}
else if (args.hlsl)
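
For context, the new --msl-version flag above simply forwards an MMmmpp-packed integer into CompilerMSL's options before compiling. A minimal sketch of the same path through the C++ API follows; the helper name, the spirv_binary argument, and the assumption that 10200 encodes MSL 1.2 are illustrative only and not part of this commit.

// Sketch only: mirrors what main.cpp does with --msl-version.
#include "spirv_msl.hpp"
#include <cstdint>
#include <string>
#include <vector>

std::string compile_to_msl(std::vector<uint32_t> spirv_binary)
{
    spirv_cross::CompilerMSL msl(std::move(spirv_binary));
    auto opts = msl.get_options();   // CompilerMSL::Options, same accessors as in the hunk above
    opts.msl_version = 10200;        // assumed MMmmpp encoding for MSL 1.2
    msl.set_options(opts);
    return msl.compile();            // emits MSL targeting the requested version
}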

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct _3
{
int4 _m0;
uint4 _m1;
};
struct _4
{
uint4 _m0;
int4 _m1;
};
kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
{
_6._m0 = _5._m1 + uint4(_5._m0);
_6._m0 = uint4(_5._m0) + _5._m1;
_6._m0 = _5._m1 + _5._m1;
_6._m0 = uint4(_5._m0 + _5._m0);
_6._m1 = int4(_5._m1 + _5._m1);
_6._m1 = _5._m0 + _5._m0;
_6._m1 = int4(_5._m1) + _5._m0;
_6._m1 = _5._m0 + int4(_5._m1);
}

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct _3
{
int4 _m0;
uint4 _m1;
};
struct _4
{
uint4 _m0;
int4 _m1;
};
kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
{
_6._m0 = uint4(int4(_5._m1) >> _5._m0);
_6._m0 = uint4(_5._m0 >> int4(_5._m1));
_6._m0 = uint4(int4(_5._m1) >> int4(_5._m1));
_6._m0 = uint4(_5._m0 >> _5._m0);
_6._m1 = int4(_5._m1) >> int4(_5._m1);
_6._m1 = _5._m0 >> _5._m0;
_6._m1 = int4(_5._m1) >> _5._m0;
_6._m1 = _5._m0 >> int4(_5._m1);
}

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct _3
{
int4 _m0;
uint4 _m1;
};
struct _4
{
uint4 _m0;
int4 _m1;
};
kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
{
_6._m0 = uint4(int4(_5._m1) / _5._m0);
_6._m0 = uint4(_5._m0 / int4(_5._m1));
_6._m0 = uint4(int4(_5._m1) / int4(_5._m1));
_6._m0 = uint4(_5._m0 / _5._m0);
_6._m1 = int4(_5._m1) / int4(_5._m1);
_6._m1 = _5._m0 / _5._m0;
_6._m1 = int4(_5._m1) / _5._m0;
_6._m1 = _5._m0 / int4(_5._m1);
}

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct _3
{
int4 _m0;
uint4 _m1;
};
struct _4
{
uint4 _m0;
int4 _m1;
};
kernel void main0(device _3& _5 [[buffer(0)]], device _4& _6 [[buffer(1)]])
{
_6._m0 = _5._m1 >> uint4(_5._m0);
_6._m0 = uint4(_5._m0) >> _5._m1;
_6._m0 = _5._m1 >> _5._m1;
_6._m0 = uint4(_5._m0) >> uint4(_5._m0);
_6._m1 = int4(_5._m1 >> _5._m1);
_6._m1 = int4(uint4(_5._m0) >> uint4(_5._m0));
_6._m1 = int4(_5._m1 >> uint4(_5._m0));
_6._m1 = int4(uint4(_5._m0) >> _5._m1);
}

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct _6
{
int4 _m0;
uint4 _m1;
};
struct _7
{
uint4 _m0;
int4 _m1;
};
kernel void main0(device _6& _8 [[buffer(0)]], device _7& _9 [[buffer(1)]])
{
_9._m0 = _8._m1 + uint4(_8._m0);
_9._m0 = uint4(_8._m0) + _8._m1;
_9._m0 = _8._m1 + _8._m1;
_9._m0 = uint4(_8._m0 + _8._m0);
_9._m1 = int4(_8._m1 + _8._m1);
_9._m1 = _8._m0 + _8._m0;
_9._m1 = int4(_8._m1) + _8._m0;
_9._m1 = _8._m0 + int4(_8._m1);
}

View File

@ -0,0 +1,21 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO0
{
float scalar;
float2 vec2_val;
float3 vec3_val;
float4 vec4_val;
};
kernel void main0(device SSBO0& _4 [[buffer(0)]])
{
_4.scalar = float(half(_4.scalar));
_4.vec2_val = float2(half2(_4.vec2_val));
_4.vec3_val = float3(half3(_4.vec3_val));
_4.vec4_val = float4(half4(_4.vec4_val));
}

View File

@ -0,0 +1,40 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
constant uint3 gl_WorkGroupSize = uint3(8u, 8u, 1u);
struct UBO
{
float4 uInvSize;
float4 uScale;
};
float jacobian(thread const float2& dDdx, thread const float2& dDdy)
{
return ((1.0 + dDdx.x) * (1.0 + dDdy.y)) - (dDdx.y * dDdy.x);
}
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], constant UBO& _46 [[buffer(0)]], texture2d<float> uHeight [[texture(0)]], sampler uHeightSmplr [[sampler(0)]], texture2d<float> uDisplacement [[texture(1)]], sampler uDisplacementSmplr [[sampler(1)]], texture2d<float, access::write> iHeightDisplacement [[texture(2)]], texture2d<float, access::write> iGradJacobian [[texture(3)]])
{
float4 uv = (float2(gl_GlobalInvocationID.xy) * _46.uInvSize.xy).xyxy + (_46.uInvSize * 0.5);
float h = uHeight.sample(uHeightSmplr, uv.xy, level(0.0)).x;
float x0 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(-1, 0)).x;
float x1 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(1, 0)).x;
float y0 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(0, -1)).x;
float y1 = uHeight.sample(uHeightSmplr, uv.xy, level(0.0), int2(0, 1)).x;
float2 grad = (_46.uScale.xy * 0.5) * float2(x1 - x0, y1 - y0);
float2 displacement = uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0)).xy * 1.2000000476837158203125;
float2 dDdx = (uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(1, 0)).xy - uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(-1, 0)).xy) * 0.60000002384185791015625;
float2 dDdy = (uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(0, 1)).xy - uDisplacement.sample(uDisplacementSmplr, uv.zw, level(0.0), int2(0, -1)).xy) * 0.60000002384185791015625;
float2 param = dDdx * _46.uScale.z;
float2 param_1 = dDdy * _46.uScale.z;
float j = jacobian(param, param_1);
displacement = float2(0.0);
iHeightDisplacement.write(float4(h, displacement, 0.0), uint2(int2(gl_GlobalInvocationID.xy)));
iGradJacobian.write(float4(grad, j, 0.0), uint2(int2(gl_GlobalInvocationID.xy)));
}

View File

@ -0,0 +1,34 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct SSBO
{
float4 in_data[1];
};
struct SSBO2
{
float4 out_data[1];
};
struct SSBO3
{
uint counter;
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(0)]], device SSBO2& _45 [[buffer(1)]], device SSBO3& _48 [[buffer(2)]])
{
uint ident = gl_GlobalInvocationID.x;
float4 idata = _23.in_data[ident];
if (dot(idata, float4(1.0, 5.0, 6.0, 2.0)) > 8.19999980926513671875)
{
uint _52 = atomic_fetch_add_explicit((volatile device atomic_uint*)&(_48.counter), 1u, memory_order_relaxed);
_45.out_data[_52] = idata;
}
}

View File

@ -0,0 +1,47 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
// Implementation of the GLSL findLSB() function
template<typename T>
T findLSB(T x)
{
return select(ctz(x), T(-1), x == T(0));
}
// Implementation of the signed GLSL findMSB() function
template<typename T>
T findSMSB(T x)
{
T v = select(x, T(-1) - x, x < T(0));
return select(clz(T(0)) - (clz(v) + T(1)), T(-1), v == T(0));
}
// Implementation of the unsigned GLSL findMSB() function
template<typename T>
T findUMSB(T x)
{
return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0));
}
kernel void main0()
{
int signed_value = 0;
uint unsigned_value = 0u;
int s = extract_bits(signed_value, 5, 20);
uint u = extract_bits(unsigned_value, 6, 21);
s = insert_bits(s, 40, 5, 4);
u = insert_bits(u, 60u, 5, 4);
u = reverse_bits(u);
s = reverse_bits(s);
int v0 = popcount(u);
int v1 = popcount(s);
int v2 = findUMSB(u);
int v3 = findSMSB(s);
int v4 = findLSB(u);
int v5 = findLSB(s);
}

View File

@ -0,0 +1,78 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
void out_test_0(thread const int& cond, thread int& i)
{
if (cond == 0)
{
i = 40;
}
else
{
i = 60;
}
}
void out_test_1(thread const int& cond, thread int& i)
{
switch (cond)
{
case 40:
{
i = 40;
break;
}
default:
{
i = 70;
break;
}
}
}
void inout_test_0(thread const int& cond, thread int& i)
{
if (cond == 0)
{
i = 40;
}
}
void inout_test_1(thread const int& cond, thread int& i)
{
switch (cond)
{
case 40:
{
i = 40;
break;
}
}
}
kernel void main0()
{
int cond = 40;
int i = 50;
int param = cond;
int param_1 = i;
out_test_0(param, param_1);
i = param_1;
int param_2 = cond;
int param_3 = i;
out_test_1(param_2, param_3);
i = param_3;
int param_4 = cond;
int param_5 = i;
inout_test_0(param_4, param_5);
i = param_5;
int param_6 = cond;
int param_7 = i;
inout_test_1(param_6, param_7);
i = param_7;
}

View File

@ -0,0 +1,15 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4 value;
};
kernel void main0(device SSBO& _10 [[buffer(0)]])
{
_10.value = float4(20.0);
}

View File

@ -0,0 +1,15 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
int4 value;
};
kernel void main0(device SSBO& _10 [[buffer(0)]], texture2d<int> uImage [[texture(0)]])
{
_10.value = uImage.read(uint2(int2(10)));
}

View File

@ -0,0 +1,36 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
constant uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);
struct SSBO
{
float in_data[1];
};
struct SSBO2
{
float out_data[1];
};
struct SSBO3
{
uint count;
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _22 [[buffer(0)]], device SSBO2& _38 [[buffer(1)]], device SSBO3& _41 [[buffer(2)]])
{
uint ident = gl_GlobalInvocationID.x;
float idata = _22.in_data[ident];
if (idata > 12.0)
{
uint _45 = atomic_fetch_add_explicit((volatile device atomic_uint*)&(_41.count), 1u, memory_order_relaxed);
_38.out_data[_45] = idata;
}
}

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4x4 mvp;
float4 in_data[1];
};
struct SSBO2
{
float4 out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _28 [[buffer(0)]], device SSBO2& _52 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
int i = 0;
float4 idat = _28.in_data[ident];
do
{
idat = _28.mvp * idat;
i++;
} while (i < 16);
_52.out_data[ident] = idat;
}

View File

@ -0,0 +1,11 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
kernel void main0(texture2d<float> uImageIn [[texture(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], texture2d<float, access::write> uImageOut [[texture(1)]])
{
float4 v = uImageIn.read(uint2((int2(gl_GlobalInvocationID.xy) + int2(uImageIn.get_width(), uImageIn.get_height()))));
uImageOut.write(v, uint2(int2(gl_GlobalInvocationID.xy)));
}

View File

@ -0,0 +1,21 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4 out_data[1];
};
kernel void main0(device SSBO& _27 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
float4 v;
v.x = 10.0;
v.y = 30.0;
v.z = 70.0;
v.w = 90.0;
_27.out_data[gl_GlobalInvocationID.x] = v;
_27.out_data[gl_GlobalInvocationID.x].y = 20.0;
}

View File

@ -0,0 +1,107 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4x4 mvp;
float4 in_data[1];
};
struct SSBO2
{
float4 out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _24 [[buffer(0)]], device SSBO2& _177 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
float4 idat = _24.in_data[ident];
int k = 0;
uint i = 0u;
if (idat.y == 20.0)
{
do
{
k *= 2;
i++;
} while (i < ident);
}
switch (k)
{
case 10:
{
for (;;)
{
i++;
if (i > 10u)
{
break;
}
continue;
}
break;
}
default:
{
for (;;)
{
i += 2u;
if (i > 20u)
{
break;
}
continue;
}
break;
}
}
while (k < 10)
{
idat *= 2.0;
k++;
}
for (uint i_1 = 0u; i_1 < 16u; i_1++, k++)
{
for (uint j = 0u; j < 30u; j++)
{
idat = _24.mvp * idat;
}
}
k = 0;
for (;;)
{
k++;
if (k > 10)
{
k += 2;
}
else
{
k += 3;
continue;
}
k += 10;
continue;
}
k = 0;
do
{
k++;
} while (k > 10);
int l = 0;
for (;;)
{
if (l == 5)
{
l++;
continue;
}
idat += float4(1.0);
l++;
continue;
}
_177.out_data[ident] = idat;
}

View File

@ -0,0 +1,16 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO2
{
float3x3 out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO2& _22 [[buffer(0)]])
{
uint ident = gl_GlobalInvocationID.x;
_22.out_data[ident] = float3x3(float3(10.0), float3(20.0), float3(40.0));
}

View File

@ -0,0 +1,35 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4 in_data[1];
};
struct SSBO2
{
float4 out_data[1];
};
// Implementation of the GLSL mod() function, which is slightly different than Metal fmod()
template<typename Tx, typename Ty>
Tx mod(Tx x, Ty y)
{
return x - y * floor(x / y);
}
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(0)]], device SSBO2& _33 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
float4 v = mod(_23.in_data[ident], _33.out_data[ident]);
_33.out_data[ident] = v;
uint4 vu = as_type<uint4>(_23.in_data[ident]) % as_type<uint4>(_33.out_data[ident]);
_33.out_data[ident] = as_type<float4>(vu);
int4 vi = as_type<int4>(_23.in_data[ident]) % as_type<int4>(_33.out_data[ident]);
_33.out_data[ident] = as_type<float4>(vi);
}

View File

@ -0,0 +1,24 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4 in_data[1];
};
struct SSBO2
{
float4 out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(0)]], device SSBO2& _35 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
float4 i;
float4 _31 = modf(_23.in_data[ident], i);
float4 v = _31;
_35.out_data[ident] = v;
}

View File

@ -0,0 +1,36 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO2
{
float4 out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO2& _27 [[buffer(0)]])
{
uint ident = gl_GlobalInvocationID.x;
if (ident == 2u)
{
_27.out_data[ident] = float4(20.0);
}
else
{
if (ident == 4u)
{
_27.out_data[ident] = float4(10.0);
return;
}
}
for (int i = 0; i < 20; i++)
{
if (i == 10)
{
break;
}
return;
}
_27.out_data[ident] = float4(10.0);
}

View File

@ -0,0 +1,29 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
int a;
};
kernel void main0(device SSBO& _9 [[buffer(0)]])
{
_9.a += 10;
_9.a -= 10;
_9.a *= 10;
_9.a /= 10;
_9.a = _9.a << 2;
_9.a = _9.a >> 3;
_9.a &= 40;
_9.a ^= 10;
_9.a %= 40;
_9.a |= 1;
bool c = false;
bool d = true;
c = c && d;
d = d || c;
_9.a = int(c && d);
}

View File

@ -0,0 +1,27 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
constant uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);
struct SSBO
{
float in_data[1];
};
struct SSBO2
{
float out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _22 [[buffer(0)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], device SSBO2& _44 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
float idata = _22.in_data[ident];
threadgroup float sShared[4];
sShared[gl_LocalInvocationIndex] = idata;
threadgroup_barrier(mem_flags::mem_threadgroup);
_44.out_data[ident] = sShared[(4u - gl_LocalInvocationIndex) - 1u];
}

View File

@ -0,0 +1,26 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct Foo
{
float4x4 m;
};
struct SSBO2
{
Foo out_data[1];
};
struct SSBO
{
Foo in_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO2& _23 [[buffer(0)]], device SSBO& _30 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
_23.out_data[ident].m = _30.in_data[ident].m * _30.in_data[ident].m;
}

View File

@ -0,0 +1,100 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct S0
{
float2 a[1];
float b;
};
struct S1
{
packed_float3 a;
float b;
};
struct S2
{
float3 a[1];
float b;
};
struct S3
{
float2 a;
float b;
};
struct S4
{
float2 c;
};
struct Content
{
S0 m0s[1];
S1 m1s[1];
S2 m2s[1];
S0 m0;
S1 m1;
S2 m2;
S3 m3;
char pad7[4];
float m4;
S4 m3s[8];
};
struct SSBO1
{
Content content;
Content content1[2];
Content content2;
char pad3[8];
float2x2 m0;
float2x2 m1;
float2x3 m2[4];
float3x2 m3;
float2x2 m4;
float2x2 m5[9];
float2x3 m6[4][2];
float3x2 m7;
float array[1];
};
struct SSBO0
{
Content content;
Content content1[2];
Content content2;
float array[1];
};
kernel void main0(device SSBO1& ssbo_430 [[buffer(0)]], device SSBO0& ssbo_140 [[buffer(1)]])
{
ssbo_430.content.m0s[0].a[0] = ssbo_140.content.m0s[0].a[0];
ssbo_430.content.m0s[0].b = ssbo_140.content.m0s[0].b;
ssbo_430.content.m1s[0].a = ssbo_140.content.m1s[0].a;
ssbo_430.content.m1s[0].b = ssbo_140.content.m1s[0].b;
ssbo_430.content.m2s[0].a[0] = ssbo_140.content.m2s[0].a[0];
ssbo_430.content.m2s[0].b = ssbo_140.content.m2s[0].b;
ssbo_430.content.m0.a[0] = ssbo_140.content.m0.a[0];
ssbo_430.content.m0.b = ssbo_140.content.m0.b;
ssbo_430.content.m1.a = ssbo_140.content.m1.a;
ssbo_430.content.m1.b = ssbo_140.content.m1.b;
ssbo_430.content.m2.a[0] = ssbo_140.content.m2.a[0];
ssbo_430.content.m2.b = ssbo_140.content.m2.b;
ssbo_430.content.m3.a = ssbo_140.content.m3.a;
ssbo_430.content.m3.b = ssbo_140.content.m3.b;
ssbo_430.content.m4 = ssbo_140.content.m4;
ssbo_430.content.m3s[0].c = ssbo_140.content.m3s[0].c;
ssbo_430.content.m3s[1].c = ssbo_140.content.m3s[1].c;
ssbo_430.content.m3s[2].c = ssbo_140.content.m3s[2].c;
ssbo_430.content.m3s[3].c = ssbo_140.content.m3s[3].c;
ssbo_430.content.m3s[4].c = ssbo_140.content.m3s[4].c;
ssbo_430.content.m3s[5].c = ssbo_140.content.m3s[5].c;
ssbo_430.content.m3s[6].c = ssbo_140.content.m3s[6].c;
ssbo_430.content.m3s[7].c = ssbo_140.content.m3s[7].c;
}

View File

@ -0,0 +1,51 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO
{
float4x4 mvp;
float4 in_data[1];
};
struct SSBO2
{
float4 out_data[1];
};
kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _24 [[buffer(0)]], device SSBO2& _89 [[buffer(1)]])
{
uint ident = gl_GlobalInvocationID.x;
float4 idat = _24.in_data[ident];
int k = 0;
for (;;)
{
int _39 = k;
int _40 = _39 + 1;
k = _40;
if (_40 < 10)
{
idat *= 2.0;
k++;
continue;
}
else
{
break;
}
}
for (uint i = 0u; i < 16u; i++, k++)
{
for (uint j = 0u; j < 30u; j++)
{
idat = _24.mvp * idat;
}
}
do
{
k++;
} while (k > 10);
_89.out_data[ident] = idat;
}

View File

@ -0,0 +1,53 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct S0
{
float4 a;
};
struct S1
{
float4 a;
};
struct SSBO0
{
S0 s0s[1];
};
struct SSBO1
{
S1 s1s[1];
};
struct SSBO2
{
float4 outputs[1];
};
float4 overload(thread const S0& s0)
{
return s0.a;
}
float4 overload(thread const S1& s1)
{
return s1.a;
}
kernel void main0(device SSBO0& _36 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO1& _55 [[buffer(1)]], device SSBO2& _66 [[buffer(2)]])
{
S0 s0;
s0.a = _36.s0s[gl_GlobalInvocationID.x].a;
S1 s1;
s1.a = _55.s1s[gl_GlobalInvocationID.x].a;
S0 param = s0;
S1 param_1 = s1;
_66.outputs[gl_GlobalInvocationID.x] = overload(param) + overload(param_1);
}

View File

@ -0,0 +1,20 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SSBO2
{
uint outputs[1];
};
struct SSBO
{
uint inputs[1];
};
kernel void main0(device SSBO2& _10 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device SSBO& _23 [[buffer(1)]])
{
_10.outputs[gl_GlobalInvocationID.x] = _23.inputs[gl_GlobalInvocationID.x] / 29u;
}

View File

@ -5,7 +5,7 @@ using namespace metal;
struct Light
{
float3 Position;
packed_float3 Position;
float Radius;
float4 Color;
};

View File

@ -39,7 +39,7 @@ vertex main0_out main0(constant UBO& _22 [[buffer(0)]])
out.oA = _22.A;
out.oB = float4(_22.B0, _22.B1);
out.oC = float4(_22.C0, _22.C1);
out.oD = float4(float3(_22.D0), _22.D1);
out.oD = float4(_22.D0, _22.D1);
out.oE = float4(_22.E0, _22.E1, _22.E2, _22.E3);
out.oF = float4(_22.F0, _22.F1, _22.F2);
return out;

View File

@ -5,7 +5,7 @@ using namespace metal;
struct Light
{
float3 Position;
packed_float3 Position;
float Radius;
float4 Color;
};

View File

@ -5,7 +5,7 @@ using namespace metal;
struct Light
{
float3 Position;
packed_float3 Position;
float Radius;
float4 Color;
};

View File

@ -47,15 +47,15 @@ T degrees(T r)
template<typename T>
T findLSB(T x)
{
return select(ctz(x), -1, x == 0);
return select(ctz(x), T(-1), x == T(0));
}
// Implementation of the signed GLSL findMSB() function
template<typename T>
T findSMSB(T x)
{
T v = select(x, -1 - x, x < 0);
return select(clz(0) - (clz(v) + 1), -1, v == 0);
T v = select(x, T(-1) - x, x < T(0));
return select(clz(T(0)) - (clz(v) + T(1)), T(-1), v == T(0));
}
// Returns the determinant of a 2x2 matrix.

View File

@ -31,7 +31,7 @@ vertex main0_out main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]]
main0_out out = {};
out.gl_Position = _18.mvp * in.aVertex;
out.vNormal = in.aNormal;
out.vColor = float3(_18.color) * _18.opacity;
out.vColor = _18.color * _18.opacity;
out.vSize = _18.targSize * _18.opacity;
return out;
}

View File

@ -0,0 +1,79 @@
; SPIR-V
; Version: 1.0
; Generator: Khronos Glslang Reference Front End; 1
; Bound: 30
; Schema: 0
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint GLCompute %func "main"
OpExecutionMode %func LocalSize 1 1 1
OpSource ESSL 310
OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
OpSourceExtension "GL_GOOGLE_include_directive"
OpMemberDecorate %input_struct 0 Offset 0
OpMemberDecorate %input_struct 1 Offset 16
OpMemberDecorate %output_struct 0 Offset 0
OpMemberDecorate %output_struct 1 Offset 16
OpDecorate %input_struct BufferBlock
OpDecorate %inputs DescriptorSet 0
OpDecorate %inputs Binding 0
OpDecorate %inputs Restrict
OpDecorate %output_struct BufferBlock
OpDecorate %outputs DescriptorSet 0
OpDecorate %outputs Binding 1
OpDecorate %outputs Restrict
%void = OpTypeVoid
%main_func = OpTypeFunction %void
%uint = OpTypeInt 32 0
%uvec4 = OpTypeVector %uint 4
%int = OpTypeInt 32 1
%ivec4 = OpTypeVector %int 4
%ivec4_ptr = OpTypePointer Uniform %ivec4
%uvec4_ptr = OpTypePointer Uniform %uvec4
%zero = OpConstant %int 0
%one = OpConstant %int 1
%input_struct = OpTypeStruct %ivec4 %uvec4
%input_struct_ptr = OpTypePointer Uniform %input_struct
%inputs = OpVariable %input_struct_ptr Uniform
%output_struct = OpTypeStruct %uvec4 %ivec4
%output_struct_ptr = OpTypePointer Uniform %output_struct
%outputs = OpVariable %output_struct_ptr Uniform
%func = OpFunction %void None %main_func
%block = OpLabel
%input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
%input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
%input1 = OpLoad %ivec4 %input1_ptr
%input0 = OpLoad %uvec4 %input0_ptr
%output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
%output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
; Test all variants of IAdd
%result_iadd_0 = OpIAdd %uvec4 %input0 %input1
%result_iadd_1 = OpIAdd %uvec4 %input1 %input0
%result_iadd_2 = OpIAdd %uvec4 %input0 %input0
%result_iadd_3 = OpIAdd %uvec4 %input1 %input1
%result_iadd_4 = OpIAdd %ivec4 %input0 %input0
%result_iadd_5 = OpIAdd %ivec4 %input1 %input1
%result_iadd_6 = OpIAdd %ivec4 %input0 %input1
%result_iadd_7 = OpIAdd %ivec4 %input1 %input0
OpStore %output_ptr_uvec4 %result_iadd_0
OpStore %output_ptr_uvec4 %result_iadd_1
OpStore %output_ptr_uvec4 %result_iadd_2
OpStore %output_ptr_uvec4 %result_iadd_3
OpStore %output_ptr_ivec4 %result_iadd_4
OpStore %output_ptr_ivec4 %result_iadd_5
OpStore %output_ptr_ivec4 %result_iadd_6
OpStore %output_ptr_ivec4 %result_iadd_7
OpReturn
OpFunctionEnd

View File

@ -0,0 +1,77 @@
; SPIR-V
; Version: 1.0
; Generator: Khronos Glslang Reference Front End; 1
; Bound: 30
; Schema: 0
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint GLCompute %func "main"
OpExecutionMode %func LocalSize 1 1 1
OpSource ESSL 310
OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
OpSourceExtension "GL_GOOGLE_include_directive"
OpMemberDecorate %input_struct 0 Offset 0
OpMemberDecorate %input_struct 1 Offset 16
OpMemberDecorate %output_struct 0 Offset 0
OpMemberDecorate %output_struct 1 Offset 16
OpDecorate %input_struct BufferBlock
OpDecorate %inputs DescriptorSet 0
OpDecorate %inputs Binding 0
OpDecorate %output_struct BufferBlock
OpDecorate %outputs DescriptorSet 0
OpDecorate %outputs Binding 1
%void = OpTypeVoid
%main_func = OpTypeFunction %void
%uint = OpTypeInt 32 0
%uvec4 = OpTypeVector %uint 4
%int = OpTypeInt 32 1
%ivec4 = OpTypeVector %int 4
%ivec4_ptr = OpTypePointer Uniform %ivec4
%uvec4_ptr = OpTypePointer Uniform %uvec4
%zero = OpConstant %int 0
%one = OpConstant %int 1
%input_struct = OpTypeStruct %ivec4 %uvec4
%input_struct_ptr = OpTypePointer Uniform %input_struct
%inputs = OpVariable %input_struct_ptr Uniform
%output_struct = OpTypeStruct %uvec4 %ivec4
%output_struct_ptr = OpTypePointer Uniform %output_struct
%outputs = OpVariable %output_struct_ptr Uniform
%func = OpFunction %void None %main_func
%block = OpLabel
%input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
%input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
%input1 = OpLoad %ivec4 %input1_ptr
%input0 = OpLoad %uvec4 %input0_ptr
%output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
%output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
; Test all variants of ShiftRightArithmetic
%result_iadd_0 = OpShiftRightArithmetic %uvec4 %input0 %input1
%result_iadd_1 = OpShiftRightArithmetic %uvec4 %input1 %input0
%result_iadd_2 = OpShiftRightArithmetic %uvec4 %input0 %input0
%result_iadd_3 = OpShiftRightArithmetic %uvec4 %input1 %input1
%result_iadd_4 = OpShiftRightArithmetic %ivec4 %input0 %input0
%result_iadd_5 = OpShiftRightArithmetic %ivec4 %input1 %input1
%result_iadd_6 = OpShiftRightArithmetic %ivec4 %input0 %input1
%result_iadd_7 = OpShiftRightArithmetic %ivec4 %input1 %input0
OpStore %output_ptr_uvec4 %result_iadd_0
OpStore %output_ptr_uvec4 %result_iadd_1
OpStore %output_ptr_uvec4 %result_iadd_2
OpStore %output_ptr_uvec4 %result_iadd_3
OpStore %output_ptr_ivec4 %result_iadd_4
OpStore %output_ptr_ivec4 %result_iadd_5
OpStore %output_ptr_ivec4 %result_iadd_6
OpStore %output_ptr_ivec4 %result_iadd_7
OpReturn
OpFunctionEnd

View File

@ -0,0 +1,77 @@
; SPIR-V
; Version: 1.0
; Generator: Khronos Glslang Reference Front End; 1
; Bound: 30
; Schema: 0
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint GLCompute %func "main"
OpExecutionMode %func LocalSize 1 1 1
OpSource ESSL 310
OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
OpSourceExtension "GL_GOOGLE_include_directive"
OpMemberDecorate %input_struct 0 Offset 0
OpMemberDecorate %input_struct 1 Offset 16
OpMemberDecorate %output_struct 0 Offset 0
OpMemberDecorate %output_struct 1 Offset 16
OpDecorate %input_struct BufferBlock
OpDecorate %inputs DescriptorSet 0
OpDecorate %inputs Binding 0
OpDecorate %output_struct BufferBlock
OpDecorate %outputs DescriptorSet 0
OpDecorate %outputs Binding 1
%void = OpTypeVoid
%main_func = OpTypeFunction %void
%uint = OpTypeInt 32 0
%uvec4 = OpTypeVector %uint 4
%int = OpTypeInt 32 1
%ivec4 = OpTypeVector %int 4
%ivec4_ptr = OpTypePointer Uniform %ivec4
%uvec4_ptr = OpTypePointer Uniform %uvec4
%zero = OpConstant %int 0
%one = OpConstant %int 1
%input_struct = OpTypeStruct %ivec4 %uvec4
%input_struct_ptr = OpTypePointer Uniform %input_struct
%inputs = OpVariable %input_struct_ptr Uniform
%output_struct = OpTypeStruct %uvec4 %ivec4
%output_struct_ptr = OpTypePointer Uniform %output_struct
%outputs = OpVariable %output_struct_ptr Uniform
%func = OpFunction %void None %main_func
%block = OpLabel
%input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
%input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
%input1 = OpLoad %ivec4 %input1_ptr
%input0 = OpLoad %uvec4 %input0_ptr
%output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
%output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
; Test all variants of SDiv
%result_iadd_0 = OpSDiv %uvec4 %input0 %input1
%result_iadd_1 = OpSDiv %uvec4 %input1 %input0
%result_iadd_2 = OpSDiv %uvec4 %input0 %input0
%result_iadd_3 = OpSDiv %uvec4 %input1 %input1
%result_iadd_4 = OpSDiv %ivec4 %input0 %input0
%result_iadd_5 = OpSDiv %ivec4 %input1 %input1
%result_iadd_6 = OpSDiv %ivec4 %input0 %input1
%result_iadd_7 = OpSDiv %ivec4 %input1 %input0
OpStore %output_ptr_uvec4 %result_iadd_0
OpStore %output_ptr_uvec4 %result_iadd_1
OpStore %output_ptr_uvec4 %result_iadd_2
OpStore %output_ptr_uvec4 %result_iadd_3
OpStore %output_ptr_ivec4 %result_iadd_4
OpStore %output_ptr_ivec4 %result_iadd_5
OpStore %output_ptr_ivec4 %result_iadd_6
OpStore %output_ptr_ivec4 %result_iadd_7
OpReturn
OpFunctionEnd

View File

@ -0,0 +1,77 @@
; SPIR-V
; Version: 1.0
; Generator: Khronos Glslang Reference Front End; 1
; Bound: 30
; Schema: 0
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint GLCompute %func "main"
OpExecutionMode %func LocalSize 1 1 1
OpSource ESSL 310
OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
OpSourceExtension "GL_GOOGLE_include_directive"
OpMemberDecorate %input_struct 0 Offset 0
OpMemberDecorate %input_struct 1 Offset 16
OpMemberDecorate %output_struct 0 Offset 0
OpMemberDecorate %output_struct 1 Offset 16
OpDecorate %input_struct BufferBlock
OpDecorate %inputs DescriptorSet 0
OpDecorate %inputs Binding 0
OpDecorate %output_struct BufferBlock
OpDecorate %outputs DescriptorSet 0
OpDecorate %outputs Binding 1
%void = OpTypeVoid
%main_func = OpTypeFunction %void
%uint = OpTypeInt 32 0
%uvec4 = OpTypeVector %uint 4
%int = OpTypeInt 32 1
%ivec4 = OpTypeVector %int 4
%ivec4_ptr = OpTypePointer Uniform %ivec4
%uvec4_ptr = OpTypePointer Uniform %uvec4
%zero = OpConstant %int 0
%one = OpConstant %int 1
%input_struct = OpTypeStruct %ivec4 %uvec4
%input_struct_ptr = OpTypePointer Uniform %input_struct
%inputs = OpVariable %input_struct_ptr Uniform
%output_struct = OpTypeStruct %uvec4 %ivec4
%output_struct_ptr = OpTypePointer Uniform %output_struct
%outputs = OpVariable %output_struct_ptr Uniform
%func = OpFunction %void None %main_func
%block = OpLabel
%input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
%input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
%input1 = OpLoad %ivec4 %input1_ptr
%input0 = OpLoad %uvec4 %input0_ptr
%output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
%output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
; Test all variants of ShiftRightLogical
%result_iadd_0 = OpShiftRightLogical %uvec4 %input0 %input1
%result_iadd_1 = OpShiftRightLogical %uvec4 %input1 %input0
%result_iadd_2 = OpShiftRightLogical %uvec4 %input0 %input0
%result_iadd_3 = OpShiftRightLogical %uvec4 %input1 %input1
%result_iadd_4 = OpShiftRightLogical %ivec4 %input0 %input0
%result_iadd_5 = OpShiftRightLogical %ivec4 %input1 %input1
%result_iadd_6 = OpShiftRightLogical %ivec4 %input0 %input1
%result_iadd_7 = OpShiftRightLogical %ivec4 %input1 %input0
OpStore %output_ptr_uvec4 %result_iadd_0
OpStore %output_ptr_uvec4 %result_iadd_1
OpStore %output_ptr_uvec4 %result_iadd_2
OpStore %output_ptr_uvec4 %result_iadd_3
OpStore %output_ptr_ivec4 %result_iadd_4
OpStore %output_ptr_ivec4 %result_iadd_5
OpStore %output_ptr_ivec4 %result_iadd_6
OpStore %output_ptr_ivec4 %result_iadd_7
OpReturn
OpFunctionEnd

View File

@ -0,0 +1,97 @@
; SPIR-V
; Version: 1.0
; Generator: Khronos Glslang Reference Front End; 1
; Bound: 30
; Schema: 0
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint Fragment %func_alt "main2" %frag_in %frag_out
OpEntryPoint GLCompute %func "main"
OpExecutionMode %func LocalSize 1 1 1
OpSource ESSL 310
OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
OpSourceExtension "GL_GOOGLE_include_directive"
OpMemberDecorate %input_struct 0 Offset 0
OpMemberDecorate %input_struct 1 Offset 16
OpMemberDecorate %output_struct 0 Offset 0
OpMemberDecorate %output_struct 1 Offset 16
OpDecorate %input_struct BufferBlock
OpDecorate %inputs DescriptorSet 0
OpDecorate %inputs Binding 0
OpDecorate %inputs Restrict
OpDecorate %output_struct BufferBlock
OpDecorate %outputs DescriptorSet 0
OpDecorate %outputs Binding 1
OpDecorate %outputs Restrict
OpDecorate %frag_in Location 0
OpDecorate %frag_out Location 0
%void = OpTypeVoid
%main_func = OpTypeFunction %void
%uint = OpTypeInt 32 0
%uvec4 = OpTypeVector %uint 4
%int = OpTypeInt 32 1
%ivec4 = OpTypeVector %int 4
%ivec4_ptr = OpTypePointer Uniform %ivec4
%uvec4_ptr = OpTypePointer Uniform %uvec4
%float = OpTypeFloat 32
%vec4 = OpTypeVector %float 4
%vec4_input_ptr = OpTypePointer Input %vec4
%vec4_output_ptr = OpTypePointer Output %vec4
%zero = OpConstant %int 0
%one = OpConstant %int 1
%input_struct = OpTypeStruct %ivec4 %uvec4
%input_struct_ptr = OpTypePointer Uniform %input_struct
%inputs = OpVariable %input_struct_ptr Uniform
%output_struct = OpTypeStruct %uvec4 %ivec4
%output_struct_ptr = OpTypePointer Uniform %output_struct
%outputs = OpVariable %output_struct_ptr Uniform
%frag_in = OpVariable %vec4_input_ptr Input
%frag_out = OpVariable %vec4_output_ptr Output
%func = OpFunction %void None %main_func
%block = OpLabel
%input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
%input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
%input1 = OpLoad %ivec4 %input1_ptr
%input0 = OpLoad %uvec4 %input0_ptr
%output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
%output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
; Test all variants of IAdd
%result_iadd_0 = OpIAdd %uvec4 %input0 %input1
%result_iadd_1 = OpIAdd %uvec4 %input1 %input0
%result_iadd_2 = OpIAdd %uvec4 %input0 %input0
%result_iadd_3 = OpIAdd %uvec4 %input1 %input1
%result_iadd_4 = OpIAdd %ivec4 %input0 %input0
%result_iadd_5 = OpIAdd %ivec4 %input1 %input1
%result_iadd_6 = OpIAdd %ivec4 %input0 %input1
%result_iadd_7 = OpIAdd %ivec4 %input1 %input0
OpStore %output_ptr_uvec4 %result_iadd_0
OpStore %output_ptr_uvec4 %result_iadd_1
OpStore %output_ptr_uvec4 %result_iadd_2
OpStore %output_ptr_uvec4 %result_iadd_3
OpStore %output_ptr_ivec4 %result_iadd_4
OpStore %output_ptr_ivec4 %result_iadd_5
OpStore %output_ptr_ivec4 %result_iadd_6
OpStore %output_ptr_ivec4 %result_iadd_7
OpReturn
OpFunctionEnd
%func_alt = OpFunction %void None %main_func
%block_alt = OpLabel
%frag_input_value = OpLoad %vec4 %frag_in
OpStore %frag_out %frag_input_value
OpReturn
OpFunctionEnd

View File

@ -0,0 +1,67 @@
; SPIR-V
; Version: 1.0
; Generator: Khronos Glslang Reference Front End; 1
; Bound: 38
; Schema: 0
OpCapability Shader
%1 = OpExtInstImport "GLSL.std.450"
OpMemoryModel Logical GLSL450
OpEntryPoint GLCompute %4 "main"
OpExecutionMode %4 LocalSize 1 1 1
OpSource ESSL 310
OpName %4 "main"
OpName %10 "SSBO0"
OpMemberName %10 0 "scalar"
OpMemberName %10 1 "vec2_val"
OpMemberName %10 2 "vec3_val"
OpMemberName %10 3 "vec4_val"
OpName %12 ""
OpMemberDecorate %10 0 Offset 0
OpMemberDecorate %10 1 Offset 8
OpMemberDecorate %10 2 Offset 16
OpMemberDecorate %10 3 Offset 32
OpDecorate %10 BufferBlock
OpDecorate %12 DescriptorSet 0
OpDecorate %12 Binding 0
%2 = OpTypeVoid
%3 = OpTypeFunction %2
%6 = OpTypeFloat 32
%7 = OpTypeVector %6 2
%8 = OpTypeVector %6 3
%9 = OpTypeVector %6 4
%10 = OpTypeStruct %6 %7 %8 %9
%11 = OpTypePointer Uniform %10
%12 = OpVariable %11 Uniform
%13 = OpTypeInt 32 1
%14 = OpConstant %13 0
%15 = OpTypePointer Uniform %6
%20 = OpConstant %13 1
%21 = OpTypePointer Uniform %7
%26 = OpConstant %13 2
%27 = OpTypePointer Uniform %8
%32 = OpConstant %13 3
%33 = OpTypePointer Uniform %9
%4 = OpFunction %2 None %3
%5 = OpLabel
%16 = OpAccessChain %15 %12 %14
%17 = OpLoad %6 %16
%18 = OpQuantizeToF16 %6 %17
%19 = OpAccessChain %15 %12 %14
OpStore %19 %18
%22 = OpAccessChain %21 %12 %20
%23 = OpLoad %7 %22
%24 = OpQuantizeToF16 %7 %23
%25 = OpAccessChain %21 %12 %20
OpStore %25 %24
%28 = OpAccessChain %27 %12 %26
%29 = OpLoad %8 %28
%30 = OpQuantizeToF16 %8 %29
%31 = OpAccessChain %27 %12 %26
OpStore %31 %30
%34 = OpAccessChain %33 %12 %32
%35 = OpLoad %9 %34
%36 = OpQuantizeToF16 %9 %35
%37 = OpAccessChain %33 %12 %32
OpStore %37 %36
OpReturn
OpFunctionEnd

View File

@ -0,0 +1,55 @@
#version 310 es
layout(local_size_x = 8, local_size_y = 8) in;
layout(binding = 0) uniform sampler2D uHeight;
layout(binding = 1) uniform sampler2D uDisplacement;
layout(rgba16f, binding = 2) uniform writeonly mediump image2D iHeightDisplacement;
layout(rgba16f, binding = 3) uniform writeonly mediump image2D iGradJacobian;
layout(binding = 4) uniform UBO
{
vec4 uInvSize;
vec4 uScale;
};
mediump float jacobian(mediump vec2 dDdx, mediump vec2 dDdy)
{
return (1.0 + dDdx.x) * (1.0 + dDdy.y) - dDdx.y * dDdy.x;
}
#define LAMBDA 1.2
void main()
{
vec4 uv = (vec2(gl_GlobalInvocationID.xy) * uInvSize.xy).xyxy + 0.5 * uInvSize;
float h = textureLod(uHeight, uv.xy, 0.0).x;
// Compute the heightmap gradient by simple differentiation.
float x0 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(-1, 0)).x;
float x1 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(+1, 0)).x;
float y0 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(0, -1)).x;
float y1 = textureLodOffset(uHeight, uv.xy, 0.0, ivec2(0, +1)).x;
vec2 grad = uScale.xy * 0.5 * vec2(x1 - x0, y1 - y0);
// Displacement map must be sampled with a different offset since it's a smaller texture.
vec2 displacement = LAMBDA * textureLod(uDisplacement, uv.zw, 0.0).xy;
// Compute jacobian.
vec2 dDdx = 0.5 * LAMBDA * (
textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(+1, 0)).xy -
textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(-1, 0)).xy);
vec2 dDdy = 0.5 * LAMBDA * (
textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(0, +1)).xy -
textureLodOffset(uDisplacement, uv.zw, 0.0, ivec2(0, -1)).xy);
float j = jacobian(dDdx * uScale.z, dDdy * uScale.z);
displacement = vec2(0.0);
// Read by vertex shader/tess shader.
imageStore(iHeightDisplacement, ivec2(gl_GlobalInvocationID.xy), vec4(h, displacement, 0.0));
// Read by fragment shader.
imageStore(iGradJacobian, ivec2(gl_GlobalInvocationID.xy), vec4(grad, j, 0.0));
}

View File

@ -0,0 +1,28 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) readonly buffer SSBO
{
vec4 in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
layout(std430, binding = 2) buffer SSBO3
{
uint counter;
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
vec4 idata = in_data[ident];
if (dot(idata, vec4(1.0, 5.0, 6.0, 2.0)) > 8.2)
{
out_data[atomicAdd(counter, 1u)] = idata;
}
}

View File

@ -0,0 +1,23 @@
#version 310 es
void main()
{
int signed_value = 0;
uint unsigned_value = 0u;
int s = bitfieldExtract(signed_value, 5, 20);
uint u = bitfieldExtract(unsigned_value, 6, 21);
s = bitfieldInsert(s, 40, 5, 4);
u = bitfieldInsert(u, 60u, 5, 4);
u = bitfieldReverse(u);
s = bitfieldReverse(s);
int v0 = bitCount(u);
int v1 = bitCount(s);
int v2 = findMSB(u);
int v3 = findMSB(s);
int v4 = findLSB(u);
int v5 = findLSB(s);
}

View File

@ -0,0 +1,54 @@
#version 310 es
// We write in all paths (and no reads), so should just be out.
void out_test_0(int cond, inout int i)
{
if (cond == 0)
i = 40;
else
i = 60;
}
// We write in all paths (and no reads), so should just be out.
void out_test_1(int cond, inout int i)
{
switch (cond)
{
case 40:
i = 40;
break;
default:
i = 70;
break;
}
}
// We don't write in all paths, so should be inout.
void inout_test_0(int cond, inout int i)
{
if (cond == 0)
i = 40;
}
void inout_test_1(int cond, inout int i)
{
switch (cond)
{
case 40:
i = 40;
break;
}
}
void main()
{
int cond = 40;
int i = 50;
out_test_0(cond, i);
out_test_1(cond, i);
inout_test_0(cond, i);
inout_test_1(cond, i);
}

View File

@ -0,0 +1,12 @@
#version 310 es
layout(local_size_x = 1) in;
layout(binding = 1) coherent restrict writeonly buffer SSBO
{
vec4 value;
};
void main()
{
value = vec4(20.0);
}

View File

@ -0,0 +1,14 @@
#version 310 es
layout(local_size_x = 1) in;
layout(binding = 1) coherent restrict writeonly buffer SSBO
{
ivec4 value;
};
layout(r32i, binding = 3) coherent readonly restrict uniform mediump iimage2D uImage;
void main()
{
value = imageLoad(uImage, ivec2(10));
}

View File

@ -0,0 +1,26 @@
#version 310 es
layout(local_size_x = 4) in;
layout(std430, binding = 0) readonly buffer SSBO
{
float in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
float out_data[];
};
layout(std430, binding = 2) buffer SSBO3
{
uint count;
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
float idata = in_data[ident];
if (idata > 12.0)
out_data[atomicAdd(count, 1u)] = idata;
}

View File

@ -0,0 +1,31 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) readonly buffer SSBO
{
mat4 mvp;
vec4 in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
int i;
void main()
{
uint ident = gl_GlobalInvocationID.x;
i = 0;
vec4 idat = in_data[ident];
do
{
idat = mvp * idat;
i++;
} while(i < 16);
out_data[ident] = idat;
}

View File

@ -0,0 +1,12 @@
#version 310 es
layout(local_size_x = 1) in;
layout(rgba8, binding = 0) uniform readonly mediump image2D uImageIn;
layout(rgba8, binding = 1) uniform writeonly mediump image2D uImageOut;
void main()
{
vec4 v = imageLoad(uImageIn, ivec2(gl_GlobalInvocationID.xy) + imageSize(uImageIn));
imageStore(uImageOut, ivec2(gl_GlobalInvocationID.xy), v);
}

View File

@ -0,0 +1,18 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) writeonly buffer SSBO
{
vec4 out_data[];
};
void main()
{
vec4 v;
v.x = 10.0;
v.y = 30.0;
v.z = 70.0;
v.w = 90.0;
out_data[gl_GlobalInvocationID.x] = v;
out_data[gl_GlobalInvocationID.x].y = 20.0;
}

View File

@ -0,0 +1,98 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) readonly buffer SSBO
{
mat4 mvp;
vec4 in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
vec4 idat = in_data[ident];
int k = 0;
uint i = 0u;
if (idat.y == 20.0)
{
do
{
k = k * 2;
i++;
} while (i < ident);
}
switch (k)
{
case 10:
for (;;)
{
i++;
if (i > 10u)
break;
}
break;
default:
for (;;)
{
i += 2u;
if (i > 20u)
break;
}
break;
}
while (k < 10)
{
idat *= 2.0;
k++;
}
for (uint i = 0u; i < 16u; i++, k++)
for (uint j = 0u; j < 30u; j++)
idat = mvp * idat;
k = 0;
for (;;)
{
k++;
if (k > 10)
{
k += 2;
}
else
{
k += 3;
continue;
}
k += 10;
}
k = 0;
do
{
k++;
} while (k > 10);
int l = 0;
for (;; l++)
{
if (l == 5)
{
continue;
}
idat += 1.0;
}
out_data[ident] = idat;
}

View File

@ -0,0 +1,14 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 1) writeonly buffer SSBO2
{
mat3 out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
out_data[ident] = mat3(vec3(10.0), vec3(20.0), vec3(40.0));
}

shaders-msl/comp/mod.comp
View File

@ -0,0 +1,26 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) readonly buffer SSBO
{
vec4 in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
vec4 v = mod(in_data[ident], out_data[ident]);
out_data[ident] = v;
uvec4 vu = floatBitsToUint(in_data[ident]) % floatBitsToUint(out_data[ident]);
out_data[ident] = uintBitsToFloat(vu);
ivec4 vi = floatBitsToInt(in_data[ident]) % floatBitsToInt(out_data[ident]);
out_data[ident] = intBitsToFloat(vi);
}

View File

@ -0,0 +1,23 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) readonly buffer SSBO
{
vec4 in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
vec4 i;
//vec4 v = frexp(in_data[ident], i);
//out_data[ident] = ldexp(v, i);
vec4 v = modf(in_data[ident], i);
out_data[ident] = v;
}

View File

@ -0,0 +1,33 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
if (ident == 2u)
{
out_data[ident] = vec4(20.0);
}
else if (ident == 4u)
{
out_data[ident] = vec4(10.0);
return;
}
for (int i = 0; i < 20; i++)
{
if (i == 10)
break;
return;
}
out_data[ident] = vec4(10.0);
}

View File

@ -0,0 +1,27 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) buffer SSBO
{
int a;
};
void main()
{
a += 10;
a -= 10;
a *= 10;
a /= 10;
a <<= 2;
a >>= 3;
a &= 40;
a ^= 10;
a %= 40;
a |= 1;
bool c = false;
bool d = true;
c = c && d;
d = d || c;
a = c && d ? 1 : 0;
}

View File

@ -0,0 +1,27 @@
#version 310 es
layout(local_size_x = 4) in;
shared float sShared[gl_WorkGroupSize.x];
layout(std430, binding = 0) readonly buffer SSBO
{
float in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
float out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
float idata = in_data[ident];
sShared[gl_LocalInvocationIndex] = idata;
memoryBarrierShared();
barrier();
out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
}

View File

@ -0,0 +1,24 @@
#version 310 es
layout(local_size_x = 1) in;
struct Foo
{
mat4 m;
};
layout(std430, binding = 0) readonly buffer SSBO
{
Foo in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
Foo out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
out_data[ident].m = in_data[ident].m * in_data[ident].m;
}

View File

@ -0,0 +1,76 @@
#version 310 es
layout(local_size_x = 1) in;
struct S0
{
vec2 a[1];
float b;
};
struct S1
{
vec3 a;
float b;
};
struct S2
{
vec3 a[1];
float b;
};
struct S3
{
vec2 a;
float b;
};
struct S4
{
vec2 c;
};
struct Content
{
S0 m0s[1];
S1 m1s[1];
S2 m2s[1];
S0 m0;
S1 m1;
S2 m2;
S3 m3;
float m4;
S4 m3s[8];
};
layout(binding = 1, std430) buffer SSBO1
{
Content content;
Content content1[2];
Content content2;
layout(column_major) mat2 m0;
layout(column_major) mat2 m1;
layout(column_major) mat2x3 m2[4];
layout(column_major) mat3x2 m3;
layout(row_major) mat2 m4;
layout(row_major) mat2 m5[9];
layout(row_major) mat2x3 m6[4][2];
layout(row_major) mat3x2 m7;
float array[];
} ssbo_430;
layout(binding = 0, std140) buffer SSBO0
{
Content content;
Content content1[2];
Content content2;
float array[];
} ssbo_140;
void main()
{
ssbo_430.content = ssbo_140.content;
}

View File

@ -0,0 +1,40 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) readonly buffer SSBO
{
mat4 mvp;
vec4 in_data[];
};
layout(std430, binding = 1) writeonly buffer SSBO2
{
vec4 out_data[];
};
void main()
{
uint ident = gl_GlobalInvocationID.x;
vec4 idat = in_data[ident];
int k = 0;
// Continue with side effects.
while (++k < 10)
{
idat *= 2.0;
k++;
}
// Again used here ...
for (uint i = 0u; i < 16u; i++, k++)
for (uint j = 0u; j < 30u; j++)
idat = mvp * idat;
do
{
k++;
} while (k > 10);
out_data[ident] = idat;
}

View File

@ -0,0 +1,45 @@
#version 310 es
layout(local_size_x = 1) in;
struct S0
{
vec4 a;
};
struct S1
{
vec4 a;
};
vec4 overload(S0 s0)
{
return s0.a;
}
vec4 overload(S1 s1)
{
return s1.a;
}
layout(std430, binding = 0) buffer SSBO0
{
S0 s0s[];
};
layout(std430, binding = 1) buffer SSBO1
{
S1 s1s[];
};
layout(std430, binding = 2) buffer SSBO2
{
vec4 outputs[];
};
void main()
{
S0 s0 = s0s[gl_GlobalInvocationID.x];
S1 s1 = s1s[gl_GlobalInvocationID.x];
outputs[gl_GlobalInvocationID.x] = overload(s0) + overload(s1);
}

View File

@ -0,0 +1,17 @@
#version 310 es
layout(local_size_x = 1) in;
layout(std430, binding = 0) buffer SSBO
{
uint inputs[];
};
layout(std430, binding = 0) buffer SSBO2
{
uint outputs[];
};
void main()
{
outputs[gl_GlobalInvocationID.x] = inputs[gl_GlobalInvocationID.x] / 29u;
}

View File

@ -341,9 +341,10 @@ struct SPIRExtension : IVariant
// so in order to avoid conflicts, we can't stick them in the ids array.
struct SPIREntryPoint
{
SPIREntryPoint(uint32_t self_, spv::ExecutionModel execution_model, std::string entry_name)
SPIREntryPoint(uint32_t self_, spv::ExecutionModel execution_model, const std::string &entry_name)
: self(self_)
, name(std::move(entry_name))
, name(entry_name)
, orig_name(entry_name)
, model(execution_model)
{
}
@ -351,6 +352,7 @@ struct SPIREntryPoint
uint32_t self = 0;
std::string name;
std::string orig_name;
std::vector<uint32_t> interface_variables;
uint64_t flags = 0;
@ -858,7 +860,9 @@ struct SPIRConstant : IVariant
uint32_t constant_type;
ConstantMatrix m;
bool specialization = false; // If the constant is a specialization constant (i.e. created with OpSpecConstant*).
bool specialization = false; // If this constant is a specialization constant (i.e. created with OpSpecConstant*).
bool is_used_as_array_length =
false; // If this constant is used as an array length which creates specialization restrictions on some backends.
// For composites which are constant arrays, etc.
std::vector<uint32_t> subconstants;
@ -916,6 +920,10 @@ public:
{
return type;
}
uint32_t get_id() const
{
return holder ? holder->self : 0;
}
bool empty() const
{
return !holder;

View File

@ -171,9 +171,8 @@ void CompilerCPP::emit_resources()
auto &type = get<SPIRType>(var.basetype);
if (var.storage != StorageClassFunction && type.pointer && type.storage == StorageClassUniform &&
!is_hidden_variable(var) &&
(meta[type.self].decoration.decoration_flags &
((1ull << DecorationBlock) | (1ull << DecorationBufferBlock))))
!is_hidden_variable(var) && (meta[type.self].decoration.decoration_flags &
((1ull << DecorationBlock) | (1ull << DecorationBufferBlock))))
{
emit_buffer_block(var);
}

View File

@ -168,7 +168,7 @@ bool Compiler::block_is_pure(const SPIRBlock &block)
case OpMemoryBarrier:
return false;
// OpExtInst is potentially impure depending on extension, but GLSL builtins are at least pure.
// OpExtInst is potentially impure depending on extension, but GLSL builtins are at least pure.
default:
break;
@ -457,7 +457,7 @@ bool Compiler::is_hidden_variable(const SPIRVariable &var, bool include_builtins
// Combined image samplers are always considered active as they are "magic" variables.
if (find_if(begin(combined_image_samplers), end(combined_image_samplers), [&var](const CombinedImageSampler &samp) {
return samp.combined_id == var.self;
}) != end(combined_image_samplers))
}) != end(combined_image_samplers))
{
return false;
}
@ -1498,18 +1498,21 @@ void Compiler::parse(const Instruction &instruction)
case OpTypeArray:
{
uint32_t id = ops[0];
auto &base = get<SPIRType>(ops[1]);
auto &arraybase = set<SPIRType>(id);
arraybase = base;
uint32_t tid = ops[1];
auto &base = get<SPIRType>(tid);
auto *c = maybe_get<SPIRConstant>(ops[2]);
arraybase = base;
arraybase.parent_type = tid;
uint32_t cid = ops[2];
mark_used_as_array_length(cid);
auto *c = maybe_get<SPIRConstant>(cid);
bool literal = c && !c->specialization;
arraybase.array_size_literal.push_back(literal);
arraybase.array.push_back(literal ? c->scalar() : ops[2]);
arraybase.parent_type = ops[1];
arraybase.array.push_back(literal ? c->scalar() : cid);
// Do NOT set arraybase.self!
break;
}
@ -2523,7 +2526,7 @@ vector<string> Compiler::get_entry_points() const
{
vector<string> entries;
for (auto &entry : entry_points)
entries.push_back(entry.second.name);
entries.push_back(entry.second.orig_name);
return entries;
}
@ -2536,8 +2539,9 @@ void Compiler::set_entry_point(const std::string &name)
SPIREntryPoint &Compiler::get_entry_point(const std::string &name)
{
auto itr =
find_if(begin(entry_points), end(entry_points),
[&](const std::pair<uint32_t, SPIREntryPoint> &entry) -> bool { return entry.second.name == name; });
find_if(begin(entry_points), end(entry_points), [&](const std::pair<uint32_t, SPIREntryPoint> &entry) -> bool {
return entry.second.orig_name == name;
});
if (itr == end(entry_points))
SPIRV_CROSS_THROW("Entry point does not exist.");
@ -2548,8 +2552,9 @@ SPIREntryPoint &Compiler::get_entry_point(const std::string &name)
const SPIREntryPoint &Compiler::get_entry_point(const std::string &name) const
{
auto itr =
find_if(begin(entry_points), end(entry_points),
[&](const std::pair<uint32_t, SPIREntryPoint> &entry) -> bool { return entry.second.name == name; });
find_if(begin(entry_points), end(entry_points), [&](const std::pair<uint32_t, SPIREntryPoint> &entry) -> bool {
return entry.second.orig_name == name;
});
if (itr == end(entry_points))
SPIRV_CROSS_THROW("Entry point does not exist.");
@ -2557,6 +2562,11 @@ const SPIREntryPoint &Compiler::get_entry_point(const std::string &name) const
return itr->second;
}
const string &Compiler::get_cleansed_entry_point_name(const std::string &name) const
{
return get_entry_point(name).name;
}
const SPIREntryPoint &Compiler::get_entry_point() const
{
return entry_points.find(entry_point)->second;
@ -2713,7 +2723,7 @@ void Compiler::CombinedImageSamplerHandler::register_combined_image_sampler(SPIR
[&param](const SPIRFunction::CombinedImageSamplerParameter &p) {
return param.image_id == p.image_id && param.sampler_id == p.sampler_id &&
param.global_image == p.global_image && param.global_sampler == p.global_sampler;
});
});
if (itr == end(caller.combined_parameters))
{
@ -2850,7 +2860,7 @@ bool Compiler::CombinedImageSamplerHandler::handle(Op opcode, const uint32_t *ar
auto itr = find_if(begin(compiler.combined_image_samplers), end(compiler.combined_image_samplers),
[image_id, sampler_id](const CombinedImageSampler &combined) {
return combined.image_id == image_id && combined.sampler_id == sampler_id;
});
});
if (itr == end(compiler.combined_image_samplers))
{
@ -2926,6 +2936,31 @@ const SPIRConstant &Compiler::get_constant(uint32_t id) const
return get<SPIRConstant>(id);
}
// Recursively marks any constants referenced by the specified constant instruction as being used
// as an array length. The id must be a constant instruction (SPIRConstant or SPIRConstantOp).
void Compiler::mark_used_as_array_length(uint32_t id)
{
switch (ids[id].get_type())
{
case TypeConstant:
get<SPIRConstant>(id).is_used_as_array_length = true;
break;
case TypeConstantOp:
{
auto &cop = get<SPIRConstantOp>(id);
for (uint32_t arg_id : cop.arguments)
mark_used_as_array_length(arg_id);
}
case TypeUndef:
return;
default:
SPIRV_CROSS_THROW("Array lengths must be a constant instruction (OpConstant.. or OpSpecConstant...).");
}
}
static bool exists_unaccessed_path_to_return(const CFG &cfg, uint32_t block, const unordered_set<uint32_t> &blocks)
{
// This block accesses the variable.
@ -3167,8 +3202,8 @@ void Compiler::analyze_variable_scope(SPIRFunction &entry)
break;
}
// Atomics shouldn't be able to access function-local variables.
// Some GLSL builtins access a pointer.
// Atomics shouldn't be able to access function-local variables.
// Some GLSL builtins access a pointer.
default:
break;

View File

@ -268,6 +268,16 @@ public:
const SPIREntryPoint &get_entry_point(const std::string &name) const;
SPIREntryPoint &get_entry_point(const std::string &name);
// Some shader languages restrict the names that can be given to entry points, and the
// corresponding backend will automatically rename an entry point name, during the call
// to compile() if it is illegal. For example, the common entry point name main() is
// illegal in MSL, and is renamed to an alternate name by the MSL backend.
// Given the original entry point name contained in the SPIR-V, this function returns
// the name, as updated by the backend during the call to compile(). If the name is not
// illegal, and has not been renamed, or if this function is called before compile(),
// this function will simply return the same name.
const std::string &get_cleansed_entry_point_name(const std::string &name) const;
// Query and modify OpExecutionMode.
uint64_t get_execution_mode_mask() const;
void unset_execution_mode(spv::ExecutionMode mode);
@ -497,6 +507,7 @@ protected:
bool expression_is_lvalue(uint32_t id) const;
bool variable_storage_is_aliased(const SPIRVariable &var);
SPIRVariable *maybe_get_backing_variable(uint32_t chain);
void mark_used_as_array_length(uint32_t id);
void register_read(uint32_t expr, uint32_t chain, bool forwarded);
void register_write(uint32_t chain);

View File

@ -2686,7 +2686,7 @@ string CompilerGLSL::declare_temporary(uint32_t result_type, uint32_t result_id)
if (find_if(begin(header.declare_temporary), end(header.declare_temporary),
[result_type, result_id](const pair<uint32_t, uint32_t> &tmp) {
return tmp.first == result_type && tmp.second == result_id;
}) == end(header.declare_temporary))
{
header.declare_temporary.emplace_back(result_type, result_id);
force_recompile = true;
@ -2913,9 +2913,8 @@ void CompilerGLSL::emit_quaternary_func_op(uint32_t result_type, uint32_t result
uint32_t op2, uint32_t op3, const char *op)
{
bool forward = should_forward(op0) && should_forward(op1) && should_forward(op2) && should_forward(op3);
emit_op(result_type, result_id,
join(op, "(", to_expression(op0), ", ", to_expression(op1), ", ", to_expression(op2), ", ",
to_expression(op3), ")"),
emit_op(result_type, result_id, join(op, "(", to_expression(op0), ", ", to_expression(op1), ", ",
to_expression(op2), ", ", to_expression(op3), ")"),
forward);
inherit_expression_dependencies(result_id, op0);
@ -3556,15 +3555,6 @@ string CompilerGLSL::to_function_args(uint32_t img, const SPIRType &imgtype, boo
return farg_str;
}
// Some languages may have additional standard library functions whose names conflict
// with a function defined in the body of the shader. Subclasses can override to rename
// the function name defined in the shader to avoid conflict with the language standard
// functions (eg. MSL includes saturate()).
string CompilerGLSL::clean_func_name(string func_name)
{
return func_name;
}
void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop, const uint32_t *args, uint32_t)
{
GLSLstd450 op = static_cast<GLSLstd450>(eop);
@ -4052,6 +4042,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
bool access_chain_is_arrayed = false;
bool row_major_matrix_needs_conversion = is_non_native_row_major_matrix(base);
bool vector_is_packed = false;
bool pending_array_enclose = false;
bool dimension_flatten = false;
@ -4148,12 +4139,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
}
}
if (member_is_packed_type(*type, index))
{
auto &membertype = get<SPIRType>(type->member_types[index]);
expr = unpack_expression_type(expr, membertype);
}
vector_is_packed = member_is_packed_type(*type, index);
row_major_matrix_needs_conversion = member_is_non_native_row_major_matrix(*type, index);
type = &get<SPIRType>(type->member_types[index]);
}
@ -4179,6 +4165,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
// Vector -> Scalar
else if (type->vecsize > 1)
{
if (vector_is_packed)
expr = unpack_expression_type(expr, *type);
if (index_is_literal)
{
expr += ".";
@ -5081,7 +5070,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
string funexpr;
vector<string> arglist;
funexpr += clean_func_name(to_name(func)) + "(";
funexpr += to_name(func) + "(";
for (uint32_t i = 0; i < length; i++)
{
// Do not pass in separate images or samplers if we're remapping
@ -5893,8 +5882,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
register_read(ops[1], ops[2], should_forward(ops[2]));
break;
// OpAtomicStore unimplemented. Not sure what would use that.
// OpAtomicLoad seems to only be relevant for atomic counters.
case OpAtomicIIncrement:
forced_temporaries.insert(ops[1]);
@ -6453,7 +6442,7 @@ bool CompilerGLSL::member_is_non_native_row_major_matrix(const SPIRType &type, u
// Checks whether the member is a packed data type that might need to be unpacked.
// GLSL does not define packed data types, but certain subclasses do.
bool CompilerGLSL::member_is_packed_type(const SPIRType &type, uint32_t index)
bool CompilerGLSL::member_is_packed_type(const SPIRType &type, uint32_t index) const
{
return has_member_decoration(type.self, index, DecorationCPacked);
}
@ -7015,11 +7004,11 @@ void CompilerGLSL::emit_function_prototype(SPIRFunction &func, uint64_t return_f
if (func.self == entry_point)
{
decl += clean_func_name("main");
decl += "main";
processing_entry_point = true;
}
else
decl += clean_func_name(to_name(func.self));
decl += to_name(func.self);
decl += "(";
vector<string> arglist;

View File

@ -211,7 +211,6 @@ protected:
uint32_t grad_x, uint32_t grad_y, uint32_t lod, uint32_t coffset,
uint32_t offset, uint32_t bias, uint32_t comp, uint32_t sample,
bool *p_forward);
virtual std::string clean_func_name(std::string func_name);
virtual void emit_buffer_block(const SPIRVariable &type);
virtual void emit_push_constant_block(const SPIRVariable &var);
virtual void emit_uniform(const SPIRVariable &var);
@ -283,7 +282,7 @@ protected:
bool is_non_native_row_major_matrix(uint32_t id);
bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index);
bool member_is_packed_type(const SPIRType &type, uint32_t index);
bool member_is_packed_type(const SPIRType &type, uint32_t index) const;
virtual std::string convert_row_major_matrix(std::string exp_str);
std::unordered_set<std::string> local_variable_names;
@ -391,7 +390,7 @@ protected:
std::string to_member_name(const SPIRType &type, uint32_t index);
std::string type_to_glsl_constructor(const SPIRType &type);
std::string argument_decl(const SPIRFunction::Parameter &arg);
std::string to_qualifiers_glsl(uint32_t id);
virtual std::string to_qualifiers_glsl(uint32_t id);
const char *to_precision_qualifiers_glsl(uint32_t id);
virtual const char *to_storage_qualifiers_glsl(const SPIRVariable &var);
const char *flags_to_precision_qualifiers_glsl(const SPIRType &type, uint64_t flags);
@ -423,7 +422,7 @@ protected:
// and force recompile.
bool check_atomic_image(uint32_t id);
void replace_illegal_names();
virtual void replace_illegal_names();
void replace_fragment_output(SPIRVariable &var);
void replace_fragment_outputs();

View File

@ -1052,10 +1052,7 @@ void CompilerHLSL::emit_resources()
if (requires_op_fmod)
{
static const char *types[] = {
"float",
"float2",
"float3",
"float4",
"float", "float2", "float3", "float4",
};
for (auto &type : types)

View File

@ -18,7 +18,6 @@
#include "GLSL.std.450.h"
#include <algorithm>
#include <cassert>
#include <numeric>
using namespace spv;
@ -31,9 +30,6 @@ CompilerMSL::CompilerMSL(vector<uint32_t> spirv_, vector<MSLVertexAttr> *p_vtx_a
vector<MSLResourceBinding> *p_res_bindings)
: CompilerGLSL(move(spirv_))
{
populate_func_name_overrides();
populate_var_name_overrides();
if (p_vtx_attrs)
for (auto &va : *p_vtx_attrs)
vtx_attrs_by_location[va.location] = &va;
@ -47,9 +43,6 @@ CompilerMSL::CompilerMSL(const uint32_t *ir, size_t word_count, MSLVertexAttr *p
MSLResourceBinding *p_res_bindings, size_t res_bindings_count)
: CompilerGLSL(ir, word_count)
{
populate_func_name_overrides();
populate_var_name_overrides();
if (p_vtx_attrs)
for (size_t i = 0; i < vtx_attrs_count; i++)
vtx_attrs_by_location[p_vtx_attrs[i].location] = &p_vtx_attrs[i];
@ -59,27 +52,12 @@ CompilerMSL::CompilerMSL(const uint32_t *ir, size_t word_count, MSLVertexAttr *p
resource_bindings.push_back(&p_res_bindings[i]);
}
// Populate the collection of function names that need to be overridden
void CompilerMSL::populate_func_name_overrides()
{
func_name_overrides["main"] = "main0";
func_name_overrides["saturate"] = "saturate0";
}
void CompilerMSL::populate_var_name_overrides()
{
var_name_overrides["kernel"] = "kernel0";
var_name_overrides["bias"] = "bias0";
}
string CompilerMSL::compile()
{
// Force a classic "C" locale, reverts when function returns
ClassicLocale classic_locale;
// Set main function name if it was explicitly set
if (!options.entry_point_name.empty())
set_name(entry_point, options.entry_point_name);
replace_illegal_names();
non_stage_in_input_var_ids.clear();
struct_member_padding.clear();
@ -102,6 +80,14 @@ string CompilerMSL::compile()
localize_global_variables();
extract_global_variables_from_functions();
// Mark any non-stage-in structs to be tightly packed.
mark_packable_structs();
// Metal does not allow dynamic array lengths.
// Resolve any specialization constants that are used for array lengths.
if (options.resolve_specialized_array_lengths)
resolve_specialized_array_lengths();
// Do not deal with GLES-isms like precision, older extensions and such.
CompilerGLSL::options.vulkan_semantics = true;
CompilerGLSL::options.es = false;
@ -114,6 +100,7 @@ string CompilerMSL::compile()
backend.swizzle_is_function = false;
backend.shared_is_implied = false;
backend.native_row_major_matrix = false;
backend.flexible_member_array_supported = false;
uint32_t pass_count = 0;
do
@ -184,7 +171,7 @@ void CompilerMSL::preprocess_op_codes()
}
}
// Move the Private global variables to the entry function.
// Move the Private and Workgroup global variables to the entry function.
// Non-constant variables cannot have global scope in Metal.
void CompilerMSL::localize_global_variables()
{
@ -192,11 +179,12 @@ void CompilerMSL::localize_global_variables()
auto iter = global_variables.begin();
while (iter != global_variables.end())
{
uint32_t gv_id = *iter;
auto &gbl_var = get<SPIRVariable>(gv_id);
if (gbl_var.storage == StorageClassPrivate)
uint32_t v_id = *iter;
auto &var = get<SPIRVariable>(v_id);
if (var.storage == StorageClassPrivate || var.storage == StorageClassWorkgroup)
{
entry_func.add_local_variable(gv_id);
var.storage = StorageClassFunction;
entry_func.add_local_variable(v_id);
iter = global_variables.erase(iter);
}
else
@ -204,6 +192,21 @@ void CompilerMSL::localize_global_variables()
}
}
// Metal does not allow dynamic array lengths.
// Turn off specialization of any constants that are used for array lengths.
void CompilerMSL::resolve_specialized_array_lengths()
{
for (auto &id : ids)
{
if (id.get_type() == TypeConstant)
{
auto &c = id.get<SPIRConstant>();
if (c.is_used_as_array_length)
c.specialization = false;
}
}
}
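A hedged sketch of how a caller could opt out of this resolution through the new option added in spirv_msl.hpp below (the spirv_words variable is hypothetical):

spirv_cross::CompilerMSL msl(std::move(spirv_words));
auto msl_opts = msl.get_options();
msl_opts.resolve_specialized_array_lengths = false; // default is true
msl.set_options(msl_opts);
std::string source = msl.compile();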
// For any global variable accessed directly by a function,
// extract that variable and add it as an argument to that function.
void CompilerMSL::extract_global_variables_from_functions()
@ -313,6 +316,54 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::
}
}
// For all variables that are some form of non-input-output interface block, mark that all the structs
// that are recursively contained within the type referenced by that variable should be packed tightly.
void CompilerMSL::mark_packable_structs()
{
for (auto &id : ids)
{
if (id.get_type() == TypeVariable)
{
auto &var = id.get<SPIRVariable>();
if (var.storage != StorageClassFunction && !is_hidden_variable(var))
{
auto &type = get<SPIRType>(var.basetype);
if (type.pointer &&
(type.storage == StorageClassUniform || type.storage == StorageClassUniformConstant ||
type.storage == StorageClassPushConstant || type.storage == StorageClassStorageBuffer) &&
(has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)))
mark_as_packable(type);
}
}
}
}
// If the specified type is a struct, it and any nested structs
// are marked as packable with the DecorationCPacked decoration.
void CompilerMSL::mark_as_packable(SPIRType &type)
{
// If this is not the base type (eg. it's a pointer or array), tunnel down
if (type.parent_type)
{
mark_as_packable(get<SPIRType>(type.parent_type));
return;
}
if (type.basetype == SPIRType::Struct)
{
set_decoration(type.self, DecorationCPacked);
// Recurse
size_t mbr_cnt = type.member_types.size();
for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
{
uint32_t mbr_type_id = type.member_types[mbr_idx];
auto &mbr_type = get<SPIRType>(mbr_type_id);
mark_as_packable(mbr_type);
}
}
}
// If a vertex attribute exists at the location, it is marked as being used by this shader
void CompilerMSL::mark_location_as_used_by_shader(uint32_t location, StorageClass storage)
{
@ -668,13 +719,14 @@ void CompilerMSL::align_struct(SPIRType &ib_type)
MemberSorter member_sorter(ib_type, meta[ib_type_id], MemberSorter::Offset);
member_sorter.sort();
uint32_t curr_offset = 0;
uint32_t curr_offset;
uint32_t mbr_cnt = uint32_t(ib_type.member_types.size());
// Test the alignment of each member, and if a member should be closer to the previous
// member than the default spacing expects, it is likely that the previous member is in
// a packed format. If so, and the previous member is packable, pack it.
// For example...this applies to any 3-element vector that is followed by a scalar.
curr_offset = 0;
for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
{
// Align current offset to the current member's default alignment.
@ -697,6 +749,7 @@ void CompilerMSL::align_struct(SPIRType &ib_type)
// Test the alignment of each member, and if a member is positioned farther than its
// alignment and the end of the previous member, add a dummy padding member that will
// be added before the current member when the declaration of this struct is emitted.
curr_offset = 0;
for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
{
// Align current offset to the current member's default alignment.
@ -816,7 +869,7 @@ void CompilerMSL::emit_custom_functions()
statement("template<typename T>");
statement("T findLSB(T x)");
begin_scope();
statement("return select(ctz(x), -1, x == 0);");
statement("return select(ctz(x), T(-1), x == T(0));");
end_scope();
statement("");
break;
@ -826,7 +879,7 @@ void CompilerMSL::emit_custom_functions()
statement("template<typename T>");
statement("T findUMSB(T x)");
begin_scope();
statement("return select(clz(0) - (clz(x) + 1), -1, x == 0);");
statement("return select(clz(T(0)) - (clz(x) + T(1)), T(-1), x == T(0));");
end_scope();
statement("");
break;
@ -836,8 +889,19 @@ void CompilerMSL::emit_custom_functions()
statement("template<typename T>");
statement("T findSMSB(T x)");
begin_scope();
statement("T v = select(x, -1 - x, x < 0);");
statement("return select(clz(0) - (clz(v) + 1), -1, v == 0);");
statement("T v = select(x, T(-1) - x, x < T(0));");
statement("return select(clz(T(0)) - (clz(v) + T(1)), T(-1), v == T(0));");
end_scope();
statement("");
break;
case SPVFuncImplArrayCopy:
statement("// Implementation of an array copy function to cover GLSL's ability to copy an array via "
"assignment. ");
statement("template<typename T>");
statement("void spvArrayCopy(thread T* dst, thread const T* src, uint count)");
begin_scope();
statement("for (uint i = 0; i < count; *dst++ = *src++, i++);");
end_scope();
statement("");
break;
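Pieced together from the statement() calls above and the call emitted by maybe_emit_array_assignment() later in this file, the helper shows up in the generated MSL roughly as follows (array names and element count are illustrative):

// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
template<typename T>
void spvArrayCopy(thread T* dst, thread const T* src, uint count)
{
	for (uint i = 0; i < count; *dst++ = *src++, i++);
}

// ...and at a call site where the GLSL wrote "dstArr = srcArr;" for a 4-element array:
spvArrayCopy(dstArr, srcArr, 4);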
@ -979,51 +1043,41 @@ void CompilerMSL::emit_custom_functions()
void CompilerMSL::emit_resources()
{
// Output all basic struct types which are not Block or BufferBlock as these are declared inplace
// when such variables are instantiated.
// Output non-interface structs. These include local function structs
// and structs nested within uniform and read-write buffers.
unordered_set<uint32_t> declared_structs;
for (auto &id : ids)
{
if (id.get_type() == TypeType)
{
auto &type = id.get<SPIRType>();
if (type.basetype == SPIRType::Struct && type.array.empty() && !type.pointer &&
!has_decoration(type.self, DecorationBlock) && !has_decoration(type.self, DecorationBufferBlock))
uint32_t type_id = type.self;
bool is_struct = (type.basetype == SPIRType::Struct) && type.array.empty();
bool is_block =
has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock);
bool is_basic_struct = is_struct && !type.pointer && !is_block;
bool is_interface = (type.storage == StorageClassInput || type.storage == StorageClassOutput ||
type.storage == StorageClassUniformConstant);
bool is_non_interface_block = is_struct && type.pointer && is_block && !is_interface;
bool is_declarable_struct = is_basic_struct || is_non_interface_block;
// Align and emit declarable structs...but avoid declaring each more than once.
if (is_declarable_struct && declared_structs.count(type_id) == 0)
{
declared_structs.insert(type_id);
if (has_decoration(type_id, DecorationCPacked))
align_struct(type);
emit_struct(type);
}
}
}
// Output Uniform buffers and constants
unordered_set<uint32_t> declared_interface_structs;
for (auto &id : ids)
{
if (id.get_type() == TypeVariable)
{
auto &var = id.get<SPIRVariable>();
auto &type = get<SPIRType>(var.basetype);
if (var.storage != StorageClassFunction && type.pointer &&
(type.storage == StorageClassUniform || type.storage == StorageClassUniformConstant ||
type.storage == StorageClassPushConstant || type.storage == StorageClassStorageBuffer) &&
(has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)) &&
!is_hidden_variable(var))
{
// Avoid declaring the same struct multiple times.
if (declared_interface_structs.count(type.self) == 0)
{
align_struct(type);
emit_struct(type);
declared_interface_structs.insert(type.self);
}
}
}
}
declare_undefined_values();
// Output interface blocks.
// Output interface structs.
emit_interface_block(stage_in_var_id);
for (auto &nsi_var : non_stage_in_input_var_ids)
emit_interface_block(nsi_var.second);
@ -1071,7 +1125,8 @@ void CompilerMSL::emit_specialization_constants()
// the work group size at compile time in SPIR-V, and [[threads_per_threadgroup]] would need to be passed around as a global.
// The work group size may be a specialization constant.
if (workgroup_size_id)
statement("constant uint3 gl_WorkGroupSize = ", constant_expression(get<SPIRConstant>(workgroup_size_id)), ";");
statement("constant uint3 ", builtin_to_glsl(BuiltInWorkgroupSize, StorageClassWorkgroup), " = ",
constant_expression(get<SPIRConstant>(workgroup_size_id)), ";");
if (!spec_consts.empty() || workgroup_size_id)
statement("");
@ -1090,7 +1145,6 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
#define BFOP(op) emit_binary_func_op(ops[0], ops[1], ops[2], ops[3], #op)
#define BFOP_CAST(op, type) \
emit_binary_func_op_cast(ops[0], ops[1], ops[2], ops[3], #op, type, opcode_is_sign_invariant(opcode))
#define BFOP(op) emit_binary_func_op(ops[0], ops[1], ops[2], ops[3], #op)
#define UFOP(op) emit_unary_func_op(ops[0], ops[1], ops[2], #op)
auto ops = stream(instruction);
@ -1434,74 +1488,185 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
}
case OpStore:
{
if (!maybe_emit_input_struct_assignment(ops[0], ops[1]))
CompilerGLSL::emit_instruction(instruction);
if (maybe_emit_input_struct_assignment(ops[0], ops[1]))
break;
if (maybe_emit_array_assignment(ops[0], ops[1]))
break;
CompilerGLSL::emit_instruction(instruction);
break;
}
// OpOuterProduct
// Compute barriers
case OpMemoryBarrier:
emit_barrier(0, ops[0], ops[1]);
break;
case OpControlBarrier:
// In GLSL a memory barrier is often followed by a control barrier.
// But in MSL, memory barriers are also control barriers, so don't
// emit a simple control barrier if a memory barrier has just been emitted.
if (previous_instruction_opcode != OpMemoryBarrier)
emit_barrier(ops[0], ops[1], ops[2]);
break;
// OpOuterProduct
default:
CompilerGLSL::emit_instruction(instruction);
break;
}
previous_instruction_opcode = opcode;
}
// Since MSL does not allow structs to be nested within the stage_in struct, the original
// input structs are flattened into a single stage_in struct by add_interface_block.
// As a result, if the LHS and RHS represent an assignment of an entire input struct,
// we must perform this member-by-member, mapping to the flattened stage_in struct.
void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem)
{
if (get_entry_point().model != ExecutionModelGLCompute)
return;
string bar_stmt = "threadgroup_barrier(mem_flags::";
uint32_t mem_sem = id_mem_sem ? get<SPIRConstant>(id_mem_sem).scalar() : MemorySemanticsMaskNone;
switch (mem_sem)
{
case MemorySemanticsCrossWorkgroupMemoryMask:
bar_stmt += "mem_device";
break;
case MemorySemanticsSubgroupMemoryMask:
case MemorySemanticsWorkgroupMemoryMask:
case MemorySemanticsAtomicCounterMemoryMask:
bar_stmt += "mem_threadgroup";
break;
case MemorySemanticsImageMemoryMask:
bar_stmt += "mem_texture";
break;
case MemorySemanticsAcquireMask:
case MemorySemanticsReleaseMask:
case MemorySemanticsAcquireReleaseMask:
case MemorySemanticsSequentiallyConsistentMask:
case MemorySemanticsUniformMemoryMask:
case MemorySemanticsMaskNone:
default:
bar_stmt += "mem_none";
break;
}
if (options.supports_msl_version(2))
{
bar_stmt += ", ";
// Use the wider of the two scopes (smaller value)
uint32_t exe_scope = id_exe_scope ? get<SPIRConstant>(id_exe_scope).scalar() : ScopeInvocation;
uint32_t mem_scope = id_mem_scope ? get<SPIRConstant>(id_mem_scope).scalar() : ScopeInvocation;
uint32_t scope = min(exe_scope, mem_scope);
switch (scope)
{
case ScopeCrossDevice:
case ScopeDevice:
bar_stmt += "memory_scope_device";
break;
case ScopeSubgroup:
case ScopeInvocation:
bar_stmt += "memory_scope_simdgroup";
break;
case ScopeWorkgroup:
default:
bar_stmt += "memory_scope_threadgroup";
break;
}
}
bar_stmt += ");";
statement(bar_stmt);
}
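For illustration, the statements built here come out in the generated MSL like the lines below; which flag and scope are chosen depends on the SPIR-V memory semantics and on whether options.supports_msl_version(2) holds:

// Workgroup memory semantics targeting MSL 1.x:
threadgroup_barrier(mem_flags::mem_threadgroup);

// Device memory semantics when the MSL 2.0 scope argument is available:
threadgroup_barrier(mem_flags::mem_device, memory_scope_device);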
// Since MSL does not allow structs to be nested within the stage_in struct, the original input
// structs are flattened into a single stage_in struct by add_interface_block. As a result,
// if the LHS and RHS represent an assignment of an entire input struct, we must perform this
// member-by-member, mapping each RHS member to its name in the flattened stage_in struct.
// Returns whether the struct assignment was emitted.
bool CompilerMSL::maybe_emit_input_struct_assignment(uint32_t id_lhs, uint32_t id_rhs)
{
auto *p_v_lhs = maybe_get_backing_variable(id_lhs);
// We only care about assignments of an entire struct
uint32_t type_id = expression_type_id(id_rhs);
auto &type = get<SPIRType>(type_id);
if (type.basetype != SPIRType::Struct)
return false;
// We only care about assignments from Input variables
auto *p_v_rhs = maybe_get_backing_variable(id_rhs);
if (!(p_v_rhs && p_v_rhs->storage == StorageClassInput))
return false;
if (p_v_lhs && p_v_rhs && p_v_rhs->storage == StorageClassInput)
// Get the ID of the type of the underlying RHS variable.
// This will be an Input OpTypePointer containing the qualified member names.
uint32_t tid_v_rhs = p_v_rhs->basetype;
// Ensure the LHS variable has been declared
auto *p_v_lhs = maybe_get_backing_variable(id_lhs);
if (p_v_lhs)
flush_variable_declaration(p_v_lhs->self);
size_t mbr_cnt = type.member_types.size();
for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
{
uint32_t tid_lhs = p_v_lhs->basetype;
uint32_t tid_rhs = p_v_rhs->basetype;
string expr;
auto &t_lhs = get<SPIRType>(tid_lhs);
auto &t_rhs = get<SPIRType>(tid_rhs);
//LHS
expr += to_name(id_lhs);
expr += ".";
expr += to_member_name(type, mbr_idx);
if (t_lhs.basetype == SPIRType::Struct && t_rhs.basetype == SPIRType::Struct)
expr += " = ";
//RHS
string qual_mbr_name = get_member_qualified_name(tid_v_rhs, mbr_idx);
if (qual_mbr_name.empty())
{
size_t mbr_cnt = t_rhs.member_types.size();
assert(t_lhs.member_types.size() == mbr_cnt);
flush_variable_declaration(p_v_lhs->self);
for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
{
string expr;
//LHS
expr += to_name(id_lhs);
expr += ".";
expr += to_member_name(t_lhs, mbr_idx);
expr += " = ";
//RHS
string qual_mbr_name = get_member_qualified_name(tid_rhs, mbr_idx);
if (qual_mbr_name.empty())
{
expr += to_name(id_rhs);
expr += ".";
expr += to_member_name(t_rhs, mbr_idx);
}
else
expr += qual_mbr_name;
statement(expr, ";");
}
return true;
expr += to_name(id_rhs);
expr += ".";
expr += to_member_name(type, mbr_idx);
}
else
expr += qual_mbr_name;
statement(expr, ";");
}
return false;
return true;
}
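As a concrete illustration (struct and member names are hypothetical), storing an entire Input struct is emitted one member at a time, with each RHS member resolved to its flattened stage_in name when one is recorded:

v_in.color = in.m_color; // RHS resolved via get_member_qualified_name()
v_in.uv = vtx.uv;        // no qualified name recorded, so "<rhs>.<member>" is used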
// Since MSL does not allow arrays to be copied via simple variable assignment,
// if the LHS and RHS represent an assignment of an entire array, it must be
// implemented by calling an array copy function.
// Returns whether the array assignment was emitted.
bool CompilerMSL::maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs)
{
// Assignment from an array initializer is fine.
if (ids[id_rhs].get_type() == TypeConstant)
return false;
// We only care about assignments of an entire array
auto &type = expression_type(id_rhs);
if (type.array.size() == 0)
return false;
// Ensure the LHS variable has been declared
auto *p_v_lhs = maybe_get_backing_variable(id_lhs);
if (p_v_lhs)
flush_variable_declaration(p_v_lhs->self);
statement("spvArrayCopy(", to_expression(id_lhs), ", ", to_expression(id_rhs), ", ", to_array_size(type, 0), ");");
register_write(id_lhs);
return true;
}
// Emits one of the atomic functions. In MSL, the atomic functions operate on pointers
@ -1604,7 +1769,7 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
emit_unary_func_op(result_type, id, args[0], "pack_float_to_unorm2x16");
break;
case GLSLstd450PackHalf2x16:
emit_unary_func_op(result_type, id, args[0], "pack_half_to_snorm2x16");
emit_unary_func_op(result_type, id, args[0], "unsupported_GLSLstd450PackHalf2x16"); // Currently unsupported
break;
case GLSLstd450UnpackSnorm4x8:
@ -1620,7 +1785,7 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
emit_unary_func_op(result_type, id, args[0], "unpack_unorm2x16_to_float");
break;
case GLSLstd450UnpackHalf2x16:
emit_unary_func_op(result_type, id, args[0], "unpack_snorm2x16_to_half");
emit_unary_func_op(result_type, id, args[0], "unsupported_GLSLstd450UnpackHalf2x16"); // Currently unsupported
break;
case GLSLstd450PackDouble2x32:
@ -1650,10 +1815,10 @@ void CompilerMSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
break;
}
// TODO:
// GLSLstd450InterpolateAtCentroid (centroid_no_perspective qualifier)
// GLSLstd450InterpolateAtSample (sample_no_perspective qualifier)
// GLSLstd450InterpolateAtOffset
default:
CompilerGLSL::emit_glsl_op(result_type, id, eop, args, count);
@ -1686,7 +1851,7 @@ void CompilerMSL::emit_function_prototype(SPIRFunction &func, uint64_t)
auto &type = get<SPIRType>(func.return_type);
decl += func_type_decl(type);
decl += " ";
decl += clean_func_name(to_name(func.self));
decl += to_name(func.self);
decl += "(";
@ -2290,14 +2455,7 @@ string CompilerMSL::func_type_decl(SPIRType &type)
return entry_type + " " + return_type;
}
// Ensures the function name is not "main", which is illegal in MSL
string CompilerMSL::clean_func_name(string func_name)
{
auto iter = func_name_overrides.find(func_name);
return (iter != func_name_overrides.end()) ? iter->second : func_name;
}
// In MSL address space qualifiers are required for all pointer or reference arguments
// In MSL, address space qualifiers are required for all pointer or reference arguments
string CompilerMSL::get_argument_address_space(const SPIRVariable &argument)
{
const auto &type = get<SPIRType>(argument.basetype);
@ -2477,20 +2635,33 @@ uint32_t CompilerMSL::get_metal_resource_index(SPIRVariable &var, SPIRType::Base
// Returns the name of the entry point of this shader
string CompilerMSL::get_entry_point_name()
{
return clean_func_name(to_name(entry_point));
return to_name(entry_point);
}
string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg)
{
auto &var = get<SPIRVariable>(arg.id);
auto &type = expression_type(arg.id);
bool constref = !arg.alias_global_variable && (!type.pointer || arg.write_count == 0);
// TODO: Check if this arg is a uniform pointer
bool pointer = type.storage == StorageClassUniformConstant;
auto &var = get<SPIRVariable>(arg.id);
return join(constref ? "const " : "", type_to_glsl(type), pointer ? " " : "& ", to_name(var.self),
type_to_array_glsl(type));
string decl;
if (constref)
decl += "const ";
decl += type_to_glsl(type);
if (is_array(type))
decl += "*";
else if (!pointer)
decl += "&";
decl += " ";
decl += to_name(var.self);
return decl;
}
// If we're currently in the entry point function, and the object
@ -2525,17 +2696,79 @@ string CompilerMSL::to_qualified_member_name(const SPIRType &type, uint32_t inde
// if the first chars are _ and a digit, which indicate a transient name.
string CompilerMSL::ensure_valid_name(string name, string pfx)
{
if (name.size() >= 2 && name[0] == '_' && isdigit(name[1]))
return (name.size() >= 2 && name[0] == '_' && isdigit(name[1])) ? (pfx + name) : name;
}
// Replace all names that match MSL keywords or Metal Standard Library functions.
void CompilerMSL::replace_illegal_names()
{
static const unordered_set<string> keywords = {
"kernel", "bias",
};
static const unordered_set<string> illegal_func_names = {
"main", "saturate",
};
for (auto &id : ids)
{
return join(pfx, name);
switch (id.get_type())
{
case TypeVariable:
{
auto &dec = meta[id.get_id()].decoration;
if (keywords.find(dec.alias) != end(keywords))
dec.alias += "0";
break;
}
case TypeFunction:
{
auto &dec = meta[id.get_id()].decoration;
if (illegal_func_names.find(dec.alias) != end(illegal_func_names))
dec.alias += "0";
break;
}
case TypeType:
{
for (auto &mbr_dec : meta[id.get_id()].members)
if (keywords.find(mbr_dec.alias) != end(keywords))
mbr_dec.alias += "0";
break;
}
default:
break;
}
}
else
for (auto &entry : entry_points)
{
auto iter = var_name_overrides.find(name);
return (iter != var_name_overrides.end()) ? iter->second : name;
// Change both the entry point name and the alias, to keep them synced.
string &ep_name = entry.second.name;
if (illegal_func_names.find(ep_name) != end(illegal_func_names))
{
ep_name += "0";
meta[entry.first].decoration.alias = ep_name;
}
}
}
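A standalone sketch of the same suffixing rule, for reference only (not part of the compiler; the reserved set simply mirrors the two sets above):

#include <string>
#include <unordered_set>

std::string msl_safe_name(std::string name)
{
	static const std::unordered_set<std::string> reserved = { "kernel", "bias", "main", "saturate" };
	if (reserved.count(name))
		name += "0"; // e.g. "main" -> "main0", "kernel" -> "kernel0"
	return name;
}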
string CompilerMSL::to_qualifiers_glsl(uint32_t id)
{
string quals;
auto &type = expression_type(id);
if (type.storage == StorageClassWorkgroup)
quals += "threadgroup ";
return quals;
}
// The optional id parameter indicates the object whose type we are trying
// to find the description for. Most type descriptions do not
// depend on a specific object's use of that type.
@ -2880,12 +3113,11 @@ string CompilerMSL::built_in_func_arg(BuiltIn builtin, bool prefix_comma)
string bi_arg;
if (prefix_comma)
bi_arg += ", ";
bi_arg += builtin_type_decl(builtin);
assert(builtin == BuiltInVertexIndex || builtin == BuiltInInstanceIndex);
bi_arg += " " + builtin_to_glsl(builtin, StorageClassInput);
bi_arg += " [[" + builtin_qualifier(builtin) + "]]";
return bi_arg;
}
@ -2905,9 +3137,6 @@ size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type,
case SPIRType::Sampler:
SPIRV_CROSS_THROW("Querying size of opaque object.");
case SPIRType::Struct:
return get_declared_struct_size(type);
default:
{
size_t component_size = type.width / 8;
@ -2915,8 +3144,12 @@ size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type,
unsigned columns = type.columns;
// For arrays, we can use ArrayStride to get an easy check.
// Runtime arrays will have zero size so force to min of one.
if (!type.array.empty())
return type_struct_member_array_stride(struct_type, index) * type.array.back();
return type_struct_member_array_stride(struct_type, index) * max(type.array.back(), 1U);
if (type.basetype == SPIRType::Struct)
return get_declared_struct_size(type);
if (columns == 1) // An unpacked 3-element vector is the same size as a 4-element vector.
{
@ -2967,11 +3200,14 @@ size_t CompilerMSL::get_declared_struct_member_alignment(const SPIRType &struct_
{
// Alignment of packed type is the same as the underlying component size.
// Alignment of unpacked type is the same as the type size (or one matrix column).
auto dec_mask = get_member_decoration_mask(struct_type.self, index);
if (dec_mask & (1ull << DecorationCPacked))
if (member_is_packed_type(struct_type, index))
return type.width / 8;
else
return get_declared_struct_member_size(struct_type, index) / type.columns;
{
// Divide by array size and column count. Runtime arrays will have zero size so force to min of one.
uint32_t array_size = type.array.empty() ? 1 : max(type.array.back(), 1U);
return get_declared_struct_member_size(struct_type, index) / (type.columns * array_size);
}
}
}
}
@ -2981,7 +3217,7 @@ bool CompilerMSL::skip_argument(uint32_t) const
return false;
}
bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, uint32_t /*length*/)
bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, uint32_t length)
{
// Since MSL exists in a single execution scope, function prototype declarations are not
// needed, and clutter the output. If secondary functions are output (either as a SPIR-V
@ -2989,12 +3225,11 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui
// suppress_missing_prototypes to suppress compiler warnings of missing function prototypes.
// Mark if the input requires the implementation of an SPIR-V function that does not exist in Metal.
SPVFuncImpl spv_func = compiler.get_spv_func_impl(opcode, args);
SPVFuncImpl spv_func = get_spv_func_impl(opcode, args);
if (spv_func != SPVFuncImplNone)
{
compiler.spv_function_implementations.insert(spv_func);
suppress_missing_prototypes = true;
return true;
}
switch (opcode)
@ -3026,21 +3261,38 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui
break;
}
// Keep track of the instruction return types, mapped by ID
if (length > 1)
result_types[args[1]] = args[0];
return true;
}
// Returns an enumeration of a SPIR-V function that needs to be output for certain Op codes.
CompilerMSL::SPVFuncImpl CompilerMSL::get_spv_func_impl(Op opcode, const uint32_t *args)
CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op opcode, const uint32_t *args)
{
switch (opcode)
{
case OpFMod:
return SPVFuncImplMod;
case OpStore:
{
// Get the result type of the RHS. Since this is run as a pre-processing stage,
// we must extract the result type directly from the Instruction, rather than the ID.
uint32_t id_rhs = args[1];
uint32_t type_id_rhs = result_types[id_rhs];
if ((compiler.ids[id_rhs].get_type() != TypeConstant) && type_id_rhs &&
compiler.is_array(compiler.get<SPIRType>(type_id_rhs)))
return SPVFuncImplArrayCopy;
break;
}
case OpExtInst:
{
uint32_t extension_set = args[2];
if (get<SPIRExtension>(extension_set).ext == SPIRExtension::GLSL)
if (compiler.get<SPIRExtension>(extension_set).ext == SPIRExtension::GLSL)
{
GLSLstd450 op_450 = static_cast<GLSLstd450>(args[3]);
switch (op_450)
@ -3057,7 +3309,7 @@ CompilerMSL::SPVFuncImpl CompilerMSL::get_spv_func_impl(Op opcode, const uint32_
return SPVFuncImplFindUMsb;
case GLSLstd450MatrixInverse:
{
auto &mat_type = get<SPIRType>(args[0]);
auto &mat_type = compiler.get<SPIRType>(args[0]);
switch (mat_type.columns)
{
case 2:

View File

@ -18,6 +18,7 @@
#define SPIRV_CROSS_MSL_HPP
#include "spirv_glsl.hpp"
#include <limits>
#include <map>
#include <set>
#include <unordered_map>
@ -63,7 +64,7 @@ using MSLStructMemberKey = uint64_t;
// Special constant used in a MSLResourceBinding desc_set
// element to indicate the bindings for the push constants.
static const uint32_t kPushConstDescSet = UINT32_MAX;
static const uint32_t kPushConstDescSet = std::numeric_limits<uint32_t>::max();
// Special constant used in a MSLResourceBinding binding
// element to indicate the bindings for the push constants.
@ -76,8 +77,24 @@ public:
// Options for compiling to Metal Shading Language
struct Options
{
uint32_t msl_version = make_msl_version(1, 2);
bool enable_point_size_builtin = true;
std::string entry_point_name;
bool resolve_specialized_array_lengths = true;
void set_msl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
{
msl_version = make_msl_version(major, minor, patch);
}
bool supports_msl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
{
return msl_version >= make_msl_version(major, minor, patch);
}
static uint32_t make_msl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
{
return (major * 10000) + (minor * 100) + patch;
}
};
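A small worked example of the MMmmpp encoding defined above; the values follow directly from make_msl_version():

spirv_cross::CompilerMSL::Options opts;
opts.set_msl_version(1, 2);                      // msl_version == 10200 (the default)
opts.set_msl_version(2);                         // msl_version == 20000
bool has_msl2 = opts.supports_msl_version(2, 0); // true only when msl_version >= 20000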
const Options &get_options() const
@ -101,6 +118,7 @@ public:
SPVFuncImplFindILsb,
SPVFuncImplFindSMsb,
SPVFuncImplFindUMsb,
SPVFuncImplArrayCopy,
SPVFuncImplInverse2x2,
SPVFuncImplInverse3x3,
SPVFuncImplInverse4x4,
@ -165,10 +183,15 @@ protected:
std::string unpack_expression_type(std::string expr_str, const SPIRType &type) override;
std::string bitcast_glsl_op(const SPIRType &result_type, const SPIRType &argument_type) override;
bool skip_argument(uint32_t id) const override;
std::string to_qualifiers_glsl(uint32_t id) override;
void replace_illegal_names() override;
void preprocess_op_codes();
void localize_global_variables();
void extract_global_variables_from_functions();
void resolve_specialized_array_lengths();
void mark_packable_structs();
void mark_as_packable(SPIRType &type);
std::unordered_map<uint32_t, std::set<uint32_t>> function_global_vars;
void extract_global_variables_from_function(uint32_t func_id, std::set<uint32_t> &added_arg_ids,
@ -181,12 +204,10 @@ protected:
void emit_resources();
void emit_specialization_constants();
void emit_interface_block(uint32_t ib_var_id);
void populate_func_name_overrides();
void populate_var_name_overrides();
bool maybe_emit_input_struct_assignment(uint32_t id_lhs, uint32_t id_rhs);
bool maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs);
std::string func_type_decl(SPIRType &type);
std::string clean_func_name(std::string func_name) override;
std::string entry_point_args(bool append_comma);
std::string get_entry_point_name();
std::string to_qualified_member_name(const SPIRType &type, uint32_t index);
@ -210,17 +231,15 @@ protected:
void align_struct(SPIRType &ib_type);
bool is_member_packable(SPIRType &ib_type, uint32_t index);
MSLStructMemberKey get_struct_member_key(uint32_t type_id, uint32_t index);
SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args);
std::string get_argument_address_space(const SPIRVariable &argument);
void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, const char *op, uint32_t mem_order_1,
uint32_t mem_order_2, bool has_mem_order_2, uint32_t op0, uint32_t op1 = 0,
bool op1_is_pointer = false, uint32_t op2 = 0);
const char *get_memory_order(uint32_t spv_mem_sem);
void add_pragma_line(const std::string &line);
void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem);
Options options;
std::unordered_map<std::string, std::string> func_name_overrides;
std::unordered_map<std::string, std::string> var_name_overrides;
std::set<SPVFuncImpl> spv_function_implementations;
std::unordered_map<uint32_t, MSLVertexAttr *> vtx_attrs_by_location;
std::map<uint32_t, uint32_t> non_stage_in_input_var_ids;
@ -238,6 +257,7 @@ protected:
std::string stage_out_var_name = "out";
std::string stage_uniform_var_name = "uniforms";
std::string sampler_name_suffix = "Smplr";
spv::Op previous_instruction_opcode = spv::OpNop;
// OpcodeHandler that handles several MSL preprocessing operations.
struct OpCodePreprocessor : OpcodeHandler
@ -248,8 +268,10 @@ protected:
}
bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override;
CompilerMSL::SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args);
CompilerMSL &compiler;
std::unordered_map<uint32_t, uint32_t> result_types;
bool suppress_missing_prototypes = false;
bool uses_atomics = false;
};