diff --git a/reference/opt/shaders-msl/flatten/rowmajor.flatten.vert b/reference/opt/shaders-msl/flatten/rowmajor.flatten.vert
new file mode 100644
index 00000000..3e0fcdbb
--- /dev/null
+++ b/reference/opt/shaders-msl/flatten/rowmajor.flatten.vert
@@ -0,0 +1,29 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVPR;
+    float4x4 uMVPC;
+    float2x4 uMVP;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+};
+
+struct main0_out
+{
+    float4 gl_Position [[position]];
+};
+
+vertex main0_out main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]])
+{
+    main0_out out = {};
+    out.gl_Position = (_18.uMVPR * in.aVertex) + (in.aVertex * _18.uMVPC);
+    return out;
+}
+
diff --git a/reference/opt/shaders-msl/flatten/swizzle.flatten.vert b/reference/opt/shaders-msl/flatten/swizzle.flatten.vert
index 1accb1d7..53fc21f9 100644
--- a/reference/opt/shaders-msl/flatten/swizzle.flatten.vert
+++ b/reference/opt/shaders-msl/flatten/swizzle.flatten.vert
@@ -38,8 +38,8 @@ vertex main0_out main0(constant UBO& _22 [[buffer(0)]])
     out.gl_Position = float4(0.0);
     out.oA = _22.A;
     out.oB = float4(_22.B0, _22.B1);
-    out.oC = float4(_22.C0, _22.C1);
-    out.oD = float4(_22.D0, _22.D1);
+    out.oC = float4(_22.C0, _22.C1) + float4(_22.C1.xy, _22.C1.z, _22.C0);
+    out.oD = float4(_22.D0, _22.D1) + float4(float3(_22.D0).xy, float3(_22.D0).z, _22.D1);
     out.oE = float4(_22.E0, _22.E1, _22.E2, _22.E3);
     out.oF = float4(_22.F0, _22.F1, _22.F2);
     return out;
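Note on the swizzle reference above: `C1` is a std140 `vec3` that occupies its own 16-byte slot, so it maps to a native `float3` and can be swizzled directly, while `D0` sits tightly packed against `D1` and maps to a packed type. A minimal MSL-style sketch (hypothetical struct, for illustration only) of why the generated code wraps the packed member in a `float3(...)` constructor before swizzling:

```cpp
// Hypothetical layout; MSL's packed vector types do not support
// multi-component swizzle syntax, so the member is unpacked first.
struct UBO
{
    packed_float3 D0; // 12 bytes, no padding before D1
    float D1;
};

static float4 demo(constant UBO& u)
{
    float3 d0 = float3(u.D0);         // unpack to a native float3
    return float4(d0.xy, d0.z, u.D1); // now swizzling is legal
}
```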
diff --git a/reference/shaders-msl-no-opt/vert/functions_nested.vert b/reference/shaders-msl-no-opt/vert/functions_nested.vert
new file mode 100644
index 00000000..f0c9d135
--- /dev/null
+++ b/reference/shaders-msl-no-opt/vert/functions_nested.vert
@@ -0,0 +1,190 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct attr_desc
+{
+    int type;
+    int attribute_size;
+    int starting_offset;
+    int stride;
+    int swap_bytes;
+    int is_volatile;
+};
+
+struct VertexBuffer
+{
+    float4x4 scale_offset_mat;
+    uint vertex_base_index;
+    int4 input_attributes[16];
+};
+
+struct VertexConstantsBuffer
+{
+    float4 vc[16];
+};
+
+constant float4 _295 = {};
+
+struct main0_out
+{
+    float4 tc0 [[user(locn0)]];
+    float4 back_color [[user(locn10)]];
+    float4 gl_Position [[position]];
+};
+
+attr_desc fetch_desc(thread const int& location, constant VertexBuffer& v_227)
+{
+    int attribute_flags = v_227.input_attributes[location].w;
+    attr_desc result;
+    result.type = v_227.input_attributes[location].x;
+    result.attribute_size = v_227.input_attributes[location].y;
+    result.starting_offset = v_227.input_attributes[location].z;
+    result.stride = attribute_flags & 255;
+    result.swap_bytes = (attribute_flags >> 8) & 1;
+    result.is_volatile = (attribute_flags >> 9) & 1;
+    return result;
+}
+
+uint get_bits(thread const uint4& v, thread const int& swap)
+{
+    if (swap != 0)
+    {
+        return ((v.w | (v.z << uint(8))) | (v.y << uint(16))) | (v.x << uint(24));
+    }
+    return ((v.x | (v.y << uint(8))) | (v.z << uint(16))) | (v.w << uint(24));
+}
+
+float4 fetch_attr(thread const attr_desc& desc, thread const int& vertex_id, thread const texture2d<uint> input_stream)
+{
+    float4 result = float4(0.0, 0.0, 0.0, 1.0);
+    bool reverse_order = false;
+    int first_byte = (vertex_id * desc.stride) + desc.starting_offset;
+    for (int n = 0; n < 4; n++)
+    {
+        if (n == desc.attribute_size)
+        {
+            break;
+        }
+        uint4 tmp;
+        switch (desc.type)
+        {
+            case 0:
+            {
+                int _131 = first_byte;
+                first_byte = _131 + 1;
+                tmp.x = input_stream.read(uint2(_131, 0)).x;
+                int _138 = first_byte;
+                first_byte = _138 + 1;
+                tmp.y = input_stream.read(uint2(_138, 0)).x;
+                uint4 param = tmp;
+                int param_1 = desc.swap_bytes;
+                result[n] = float(get_bits(param, param_1));
+                break;
+            }
+            case 1:
+            {
+                int _156 = first_byte;
+                first_byte = _156 + 1;
+                tmp.x = input_stream.read(uint2(_156, 0)).x;
+                int _163 = first_byte;
+                first_byte = _163 + 1;
+                tmp.y = input_stream.read(uint2(_163, 0)).x;
+                int _170 = first_byte;
+                first_byte = _170 + 1;
+                tmp.z = input_stream.read(uint2(_170, 0)).x;
+                int _177 = first_byte;
+                first_byte = _177 + 1;
+                tmp.w = input_stream.read(uint2(_177, 0)).x;
+                uint4 param_2 = tmp;
+                int param_3 = desc.swap_bytes;
+                result[n] = as_type<float>(get_bits(param_2, param_3));
+                break;
+            }
+            case 2:
+            {
+                int _195 = first_byte;
+                first_byte = _195 + 1;
+                result[n] = float(input_stream.read(uint2(_195, 0)).x);
+                reverse_order = desc.swap_bytes != 0;
+                break;
+            }
+        }
+    }
+    float4 _209;
+    if (reverse_order)
+    {
+        _209 = result.wzyx;
+    }
+    else
+    {
+        _209 = result;
+    }
+    return _209;
+}
+
+float4 read_location(thread const int& location, constant VertexBuffer& v_227, thread uint& gl_VertexIndex, thread texture2d<uint> buff_in_2, thread texture2d<uint> buff_in_1)
+{
+    int param = location;
+    attr_desc desc = fetch_desc(param, v_227);
+    int vertex_id = gl_VertexIndex - int(v_227.vertex_base_index);
+    if (desc.is_volatile != 0)
+    {
+        attr_desc param_1 = desc;
+        int param_2 = vertex_id;
+        return fetch_attr(param_1, param_2, buff_in_2);
+    }
+    else
+    {
+        attr_desc param_3 = desc;
+        int param_4 = vertex_id;
+        return fetch_attr(param_3, param_4, buff_in_1);
+    }
+}
+
+void vs_adjust(thread float4& dst_reg0, thread float4& dst_reg1, thread float4& dst_reg7, constant VertexBuffer& v_227, thread uint& gl_VertexIndex, thread texture2d<uint> buff_in_2, thread texture2d<uint> buff_in_1, constant VertexConstantsBuffer& v_309)
+{
+    int param = 3;
+    float4 in_diff_color = read_location(param, v_227, gl_VertexIndex, buff_in_2, buff_in_1);
+    int param_1 = 0;
+    float4 in_pos = read_location(param_1, v_227, gl_VertexIndex, buff_in_2, buff_in_1);
+    int param_2 = 8;
+    float4 in_tc0 = read_location(param_2, v_227, gl_VertexIndex, buff_in_2, buff_in_1);
+    dst_reg1 = in_diff_color * v_309.vc[13];
+    float4 tmp0;
+    tmp0.x = float4(dot(float4(in_pos.xyz, 1.0), v_309.vc[4])).x;
+    tmp0.y = float4(dot(float4(in_pos.xyz, 1.0), v_309.vc[5])).y;
+    tmp0.z = float4(dot(float4(in_pos.xyz, 1.0), v_309.vc[6])).z;
+    float4 tmp1;
+    float4 _359 = float4(in_tc0.xy.x, in_tc0.xy.y, tmp1.z, tmp1.w);
+    tmp1 = _359;
+    tmp1.z = v_309.vc[15].x;
+    dst_reg7.y = float4(dot(float4(tmp1.xyz, 1.0), v_309.vc[8])).y;
+    dst_reg7.x = float4(dot(float4(tmp1.xyz, 1.0), v_309.vc[7])).x;
+    dst_reg0.y = float4(dot(float4(tmp0.xyz, 1.0), v_309.vc[1])).y;
+    dst_reg0.x = float4(dot(float4(tmp0.xyz, 1.0), v_309.vc[0])).x;
+}
+
+vertex main0_out main0(constant VertexBuffer& v_227 [[buffer(0)]], uint gl_VertexIndex [[vertex_id]], texture2d<uint> buff_in_2 [[texture(0)]], texture2d<uint> buff_in_1 [[texture(1)]], constant VertexConstantsBuffer& v_309 [[buffer(1)]])
+{
+    main0_out out = {};
+    float4 dst_reg0 = float4(0.0, 0.0, 0.0, 1.0);
+    float4 dst_reg1 = float4(0.0);
+    float4 dst_reg7 = float4(0.0);
+    float4 param = dst_reg0;
+    float4 param_1 = dst_reg1;
+    float4 param_2 = dst_reg7;
+    vs_adjust(param, param_1, param_2, v_227, gl_VertexIndex, buff_in_2, buff_in_1, v_309);
+    dst_reg0 = param;
+    dst_reg1 = param_1;
+    dst_reg7 = param_2;
+    out.gl_Position = dst_reg0;
+    out.back_color = dst_reg1;
+    out.tc0 = dst_reg7;
+    out.gl_Position *= v_227.scale_offset_mat;
+    return out;
+}
+
diff --git a/reference/shaders-msl/flatten/rowmajor.flatten.vert b/reference/shaders-msl/flatten/rowmajor.flatten.vert
new file mode 100644
index 00000000..3ea6d78b
--- /dev/null
+++ b/reference/shaders-msl/flatten/rowmajor.flatten.vert
@@ -0,0 +1,38 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct UBO
+{
+    float4x4 uMVPR;
+    float4x4 uMVPC;
+    float2x4 uMVP;
+};
+
+struct main0_in
+{
+    float4 aVertex [[attribute(0)]];
+};
+
+struct main0_out
+{
+    float4 gl_Position [[position]];
+};
+
+// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.
+float2x4 spvConvertFromRowMajor2x4(float2x4 m)
+{
+    return float2x4(float4(m[0][0], m[0][2], m[1][0], m[1][2]), float4(m[0][1], m[0][3], m[1][1], m[1][3]));
+}
+
+vertex main0_out main0(main0_in in [[stage_in]], constant UBO& _18 [[buffer(0)]])
+{
+    main0_out out = {};
+    float2 v = in.aVertex * spvConvertFromRowMajor2x4(_18.uMVP);
+    out.gl_Position = (_18.uMVPR * in.aVertex) + (in.aVertex * _18.uMVPC);
+    return out;
+}
+
diff --git a/reference/shaders-msl/flatten/swizzle.flatten.vert b/reference/shaders-msl/flatten/swizzle.flatten.vert
index 1accb1d7..53fc21f9 100644
--- a/reference/shaders-msl/flatten/swizzle.flatten.vert
+++ b/reference/shaders-msl/flatten/swizzle.flatten.vert
@@ -38,8 +38,8 @@ vertex main0_out main0(constant UBO& _22 [[buffer(0)]])
     out.gl_Position = float4(0.0);
     out.oA = _22.A;
     out.oB = float4(_22.B0, _22.B1);
-    out.oC = float4(_22.C0, _22.C1);
-    out.oD = float4(_22.D0, _22.D1);
+    out.oC = float4(_22.C0, _22.C1) + float4(_22.C1.xy, _22.C1.z, _22.C0);
+    out.oD = float4(_22.D0, _22.D1) + float4(float3(_22.D0).xy, float3(_22.D0).z, _22.D1);
     out.oE = float4(_22.E0, _22.E1, _22.E2, _22.E3);
     out.oF = float4(_22.F0, _22.F1, _22.F2);
     return out;
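The `spvConvertFromRowMajor2x4` helper emitted above does not call `transpose` (that only exists for square matrices); it re-gathers elements by index. A standalone sanity check of the index pattern, written in plain C++ over a simplified model that ignores std140 row padding: the two `float4` "columns" of the MSL `float2x4` are assumed to hold the four row-major rows of two floats back-to-back.

```cpp
#include <cassert>

int main()
{
    // Row-major storage of 4 rows x 2 cols: r[row][col].
    float r[4][2] = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
    // The buffer feeds the MSL float2x4 as two float4 "columns":
    // m[0] = rows 0-1 flattened, m[1] = rows 2-3 flattened.
    float m[2][4] = {{r[0][0], r[0][1], r[1][0], r[1][1]},
                     {r[2][0], r[2][1], r[3][0], r[3][1]}};
    // Exactly the pattern from the generated helper.
    float col0[4] = {m[0][0], m[0][2], m[1][0], m[1][2]};
    float col1[4] = {m[0][1], m[0][3], m[1][1], m[1][3]};
    for (int i = 0; i < 4; i++)
    {
        assert(col0[i] == r[i][0]); // first column of the logical matrix
        assert(col1[i] == r[i][1]); // second column
    }
}
```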
diff --git a/shaders-msl-no-opt/vert/functions_nested.vert b/shaders-msl-no-opt/vert/functions_nested.vert
new file mode 100644
index 00000000..2eec5ac5
--- /dev/null
+++ b/shaders-msl-no-opt/vert/functions_nested.vert
@@ -0,0 +1,132 @@
+#version 450
+#extension GL_ARB_separate_shader_objects : enable
+
+layout(std140, set = 0, binding = 0) uniform VertexBuffer
+{
+	mat4 scale_offset_mat;
+	uint vertex_base_index;
+	ivec4 input_attributes[16];
+};
+layout(set=0, binding=3) uniform usamplerBuffer buff_in_1;
+layout(set=0, binding=4) uniform usamplerBuffer buff_in_2;
+
+layout(location=10) out vec4 back_color;
+layout(location=0) out vec4 tc0;
+
+layout(std140, set=0, binding = 1) uniform VertexConstantsBuffer
+{
+	vec4 vc[16];
+};
+
+struct attr_desc
+{
+	int type;
+	int attribute_size;
+	int starting_offset;
+	int stride;
+	int swap_bytes;
+	int is_volatile;
+};
+
+uint get_bits(uvec4 v, int swap)
+{
+	if (swap != 0) return (v.w | v.z << 8 | v.y << 16 | v.x << 24);
+	return (v.x | v.y << 8 | v.z << 16 | v.w << 24);
+}
+
+vec4 fetch_attr(attr_desc desc, int vertex_id, usamplerBuffer input_stream)
+{
+	vec4 result = vec4(0.0f, 0.0f, 0.0f, 1.0f);
+	uvec4 tmp;
+	uint bits;
+	bool reverse_order = false;
+
+	int first_byte = (vertex_id * desc.stride) + desc.starting_offset;
+	for (int n = 0; n < 4; n++)
+	{
+		if (n == desc.attribute_size) break;
+
+		switch (desc.type)
+		{
+		case 0:
+			//signed normalized 16-bit
+			tmp.x = texelFetch(input_stream, first_byte++).x;
+			tmp.y = texelFetch(input_stream, first_byte++).x;
+			result[n] = get_bits(tmp, desc.swap_bytes);
+			break;
+		case 1:
+			//float
+			tmp.x = texelFetch(input_stream, first_byte++).x;
+			tmp.y = texelFetch(input_stream, first_byte++).x;
+			tmp.z = texelFetch(input_stream, first_byte++).x;
+			tmp.w = texelFetch(input_stream, first_byte++).x;
+			result[n] = uintBitsToFloat(get_bits(tmp, desc.swap_bytes));
+			break;
+		case 2:
+			//unsigned byte
+			result[n] = texelFetch(input_stream, first_byte++).x;
+			reverse_order = (desc.swap_bytes != 0);
+			break;
+		}
+	}
+
+	return (reverse_order)? result.wzyx: result;
+}
+
+attr_desc fetch_desc(int location)
+{
+	attr_desc result;
+	int attribute_flags = input_attributes[location].w;
+	result.type = input_attributes[location].x;
+	result.attribute_size = input_attributes[location].y;
+	result.starting_offset = input_attributes[location].z;
+	result.stride = attribute_flags & 0xFF;
+	result.swap_bytes = (attribute_flags >> 8) & 0x1;
+	result.is_volatile = (attribute_flags >> 9) & 0x1;
+	return result;
+}
+
+vec4 read_location(int location)
+{
+	attr_desc desc = fetch_desc(location);
+
+	int vertex_id = gl_VertexIndex - int(vertex_base_index);
+	if (desc.is_volatile != 0)
+		return fetch_attr(desc, vertex_id, buff_in_2);
+	else
+		return fetch_attr(desc, vertex_id, buff_in_1);
+}
+
+void vs_adjust(inout vec4 dst_reg0, inout vec4 dst_reg1, inout vec4 dst_reg7)
+{
+	vec4 tmp0;
+	vec4 tmp1;
+	vec4 in_diff_color= read_location(3);
+	vec4 in_pos= read_location(0);
+	vec4 in_tc0= read_location(8);
+	dst_reg1 = (in_diff_color * vc[13]);
+	tmp0.x = vec4(dot(vec4(in_pos.xyzx.xyz, 1.0), vc[4])).x;
+	tmp0.y = vec4(dot(vec4(in_pos.xyzx.xyz, 1.0), vc[5])).y;
+	tmp0.z = vec4(dot(vec4(in_pos.xyzx.xyz, 1.0), vc[6])).z;
+	tmp1.xy = in_tc0.xyxx.xy;
+	tmp1.z = vc[15].xxxx.z;
+	dst_reg7.y = vec4(dot(vec4(tmp1.xyzx.xyz, 1.0), vc[8])).y;
+	dst_reg7.x = vec4(dot(vec4(tmp1.xyzx.xyz, 1.0), vc[7])).x;
+	dst_reg0.y = vec4(dot(vec4(tmp0.xyzx.xyz, 1.0), vc[1])).y;
+	dst_reg0.x = vec4(dot(vec4(tmp0.xyzx.xyz, 1.0), vc[0])).x;
+}
+
+void main ()
+{
+	vec4 dst_reg0= vec4(0.0f, 0.0f, 0.0f, 1.0f);
+	vec4 dst_reg1= vec4(0.0, 0.0, 0.0, 0.0);
+	vec4 dst_reg7= vec4(0.0, 0.0, 0.0, 0.0);
+
+	vs_adjust(dst_reg0, dst_reg1, dst_reg7);
+
+	gl_Position = dst_reg0;
+	back_color = dst_reg1;
+	tc0 = dst_reg7;
+	gl_Position = gl_Position * scale_offset_mat;
+}
+
diff --git a/shaders-msl/flatten/rowmajor.flatten.vert b/shaders-msl/flatten/rowmajor.flatten.vert
new file mode 100644
index 00000000..88c468c8
--- /dev/null
+++ b/shaders-msl/flatten/rowmajor.flatten.vert
@@ -0,0 +1,16 @@
+#version 310 es
+
+layout(std140) uniform UBO
+{
+	layout(column_major) mat4 uMVPR;
+	layout(row_major) mat4 uMVPC;
+	layout(row_major) mat2x4 uMVP;
+};
+
+layout(location = 0) in vec4 aVertex;
+
+void main()
+{
+	vec2 v = aVertex * uMVP;
+	gl_Position = uMVPR * aVertex + uMVPC * aVertex;
+}
diff --git a/shaders-msl/flatten/swizzle.flatten.vert b/shaders-msl/flatten/swizzle.flatten.vert
index b4d9655d..e310cdf3 100644
--- a/shaders-msl/flatten/swizzle.flatten.vert
+++ b/shaders-msl/flatten/swizzle.flatten.vert
@@ -40,8 +40,8 @@ void main()
 
 	oA = A;
 	oB = vec4(B0, B1);
-	oC = vec4(C0, C1);
-	oD = vec4(D0, D1);
+	oC = vec4(C0, C1) + vec4(C1.xy, C1.z, C0);   // not packed
+	oD = vec4(D0, D1) + vec4(D0.xy, D0.z, D1);   // packed - must convert for swizzle
 	oE = vec4(E0, E1, E2, E3);
 	oF = vec4(F0, F1, F2);
 }
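The GLSL test shader above assembles 32-bit values byte-by-byte from a `usamplerBuffer` and optionally byte-swaps them. A plain C++ equivalent of its `get_bits()` logic, for clarity:

```cpp
#include <cassert>
#include <cstdint>

uint32_t get_bits(const uint8_t v[4], bool swap)
{
    if (swap) // big-endian source: v[0] is the most significant byte
        return v[3] | (v[2] << 8) | (v[1] << 16) | (uint32_t(v[0]) << 24);
    return v[0] | (v[1] << 8) | (v[2] << 16) | (uint32_t(v[3]) << 24);
}

int main()
{
    const uint8_t bytes[4] = {0x78, 0x56, 0x34, 0x12};
    assert(get_bits(bytes, false) == 0x12345678u); // little-endian read
    assert(get_bits(bytes, true) == 0x78563412u);  // byte-swapped read
}
```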
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index 74120d00..be7c38cc 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -2237,7 +2237,7 @@ string CompilerGLSL::to_expression(uint32_t id)
 	if (e.base_expression)
 		return to_enclosed_expression(e.base_expression) + e.expression;
 	else if (e.need_transpose)
-		return convert_row_major_matrix(e.expression);
+		return convert_row_major_matrix(e.expression, get<SPIRType>(e.expression_type));
 	else
 		return e.expression;
 }
@@ -4252,7 +4252,8 @@ const char *CompilerGLSL::index_to_swizzle(uint32_t index)
 }
 
 string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count,
-                                           bool index_is_literal, bool chain_only, bool *need_transpose)
+                                           bool index_is_literal, bool chain_only, bool *need_transpose,
+                                           bool *result_is_packed)
 {
 	string expr;
 	if (!chain_only)
@@ -4411,7 +4412,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count,
 		{
 			if (row_major_matrix_needs_conversion)
 			{
-				expr = convert_row_major_matrix(expr);
+				expr = convert_row_major_matrix(expr, *type);
 				row_major_matrix_needs_conversion = false;
 			}
 
@@ -4429,7 +4430,10 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count,
 		else if (type->vecsize > 1)
 		{
 			if (vector_is_packed)
+			{
 				expr = unpack_expression_type(expr, *type);
+				vector_is_packed = false;
+			}
 
 			if (index_is_literal)
 			{
@@ -4465,6 +4469,10 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count,
 	if (need_transpose)
 		*need_transpose = row_major_matrix_needs_conversion;
+
+	if (result_is_packed)
+		*result_is_packed = vector_is_packed;
+
 	return expr;
 }
 
@@ -4474,7 +4482,7 @@ std::string CompilerGLSL::to_flattened_struct_member(const SPIRType &type, uint32_t index)
 }
 
 string CompilerGLSL::access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type,
-                                  bool *out_need_transpose)
+                                  bool *out_need_transpose, bool *result_is_packed)
 {
 	if (flattened_buffer_blocks.count(base))
 	{
@@ -4484,6 +4492,8 @@ string CompilerGLSL::access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type,
 
 		if (out_need_transpose)
 			*out_need_transpose = target_type.columns > 1 && need_transpose;
+		if (result_is_packed)
+			*result_is_packed = false;
 
 		return flattened_access_chain(base, indices, count, target_type, 0, matrix_stride, need_transpose);
 	}
@@ -4493,11 +4503,13 @@ string CompilerGLSL::access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type,
 		auto &type = get<SPIRType>(get<SPIRVariable>(base).basetype);
 		if (out_need_transpose)
 			*out_need_transpose = false;
+		if (result_is_packed)
+			*result_is_packed = false;
 		return sanitize_underscores(join(to_name(type.self), "_", chain));
 	}
 	else
 	{
-		return access_chain_internal(base, indices, count, false, false, out_need_transpose);
+		return access_chain_internal(base, indices, count, false, false, out_need_transpose, result_is_packed);
 	}
 }
 
@@ -4590,7 +4602,7 @@ std::string CompilerGLSL::flattened_access_chain_struct(uint32_t base, const uint32_t *indices, uint32_t count,
 		// Cannot forward transpositions, so resolve them here.
 		if (need_transpose)
-			expr += convert_row_major_matrix(tmp);
+			expr += convert_row_major_matrix(tmp, member_type);
 		else
 			expr += tmp;
 	}
@@ -5080,7 +5092,7 @@ string CompilerGLSL::build_composite_combiner(const uint32_t *elems, uint32_t length)
 			{
 				// We'll likely end up with duplicated swizzles, e.g.
 				// foobar.xyz.xyz from patterns like
-				// OpVectorSwizzle
+				// OpVectorShuffle
 				// OpCompositeExtract x 3
 				// OpCompositeConstruct 3x + other scalar.
 				// Just modify op in-place.
@@ -5240,6 +5252,11 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		auto &e = emit_op(result_type, id, expr, forward, true);
 		e.need_transpose = need_transpose;
 		register_read(id, ptr, forward);
+
+		// Pass through whether the result is of a packed type.
+		if (has_decoration(ptr, DecorationCPacked))
+			set_decoration(id, DecorationCPacked);
+
 		break;
 	}
 
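A simplified restatement of the pass-through rule introduced here (the real logic lives inline in `emit_instruction` for OpLoad and OpAccessChain): whatever "packed" status the source ID had, the freshly minted expression ID gets, so later swizzles know whether to unpack first. Sketch using SPIRV-Cross's public decoration accessors:

```cpp
#include "spirv_cross.hpp"

// Illustrative helper, not part of the actual patch.
static void propagate_packed(spirv_cross::Compiler &compiler,
                             uint32_t src_id, uint32_t dst_id)
{
    if (compiler.has_decoration(src_id, spv::DecorationCPacked))
        compiler.set_decoration(dst_id, spv::DecorationCPacked);
    else
        compiler.unset_decoration(dst_id, spv::DecorationCPacked);
}
```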
@@ -5252,11 +5269,18 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 
 		// If the base is immutable, the access chain pointer must also be.
 		// If an expression is mutable and forwardable, we speculate that it is immutable.
-		bool need_transpose;
-		auto e = access_chain(ops[2], &ops[3], length - 3, get<SPIRType>(ops[0]), &need_transpose);
+		bool need_transpose, result_is_packed;
+		auto e = access_chain(ops[2], &ops[3], length - 3, get<SPIRType>(ops[0]), &need_transpose, &result_is_packed);
 		auto &expr = set<SPIRExpression>(ops[1], move(e), ops[0], should_forward(ops[2]));
 		expr.loaded_from = ops[2];
 		expr.need_transpose = need_transpose;
+
+		// Mark the result as being packed. Some platforms handle packed vectors differently from non-packed ones.
+		if (result_is_packed)
+			set_decoration(ops[1], DecorationCPacked);
+		else
+			unset_decoration(ops[1], DecorationCPacked);
+
 		break;
 	}
 
@@ -5635,11 +5659,13 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			shuffle = true;
 
 	string expr;
-	bool trivial_forward;
+	bool should_fwd, trivial_forward;
 
 	if (shuffle)
 	{
-		trivial_forward = !expression_is_forwarded(vec0) && !expression_is_forwarded(vec1);
+		bool allow_fwd = !backend.force_temp_use_for_two_vector_shuffles;
+		should_fwd = allow_fwd && should_forward(vec0) && should_forward(vec1);
+		trivial_forward = allow_fwd && !expression_is_forwarded(vec0) && !expression_is_forwarded(vec1);
 
 		// Constructor style and shuffling from two different vectors.
 		vector<string> args;
@@ -5654,13 +5680,19 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	}
 	else
 	{
+		should_fwd = should_forward(vec0);
 		trivial_forward = !expression_is_forwarded(vec0);
 
 		// We only source from first vector, so can use swizzle.
+		// If the vector is packed, unpack it before applying a swizzle (needed for MSL)
 		expr += to_enclosed_expression(vec0);
+		if (has_decoration(vec0, DecorationCPacked))
+			expr = unpack_expression_type(expr, expression_type(vec0));
+
 		expr += ".";
 		for (uint32_t i = 0; i < length; i++)
 			expr += index_to_swizzle(elems[i]);
+
 		if (backend.swizzle_is_function && length > 1)
 			expr += "()";
 	}
@@ -5668,7 +5700,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 
 	// A shuffle is trivial in that it doesn't actually *do* anything.
 	// We inherit the forwardedness from our arguments to avoid flushing out to temporaries when it's not really needed.
-	emit_op(result_type, id, expr, should_forward(vec0) && should_forward(vec1), trivial_forward);
+	emit_op(result_type, id, expr, should_fwd, trivial_forward);
 
 	break;
 }
@@ -6167,8 +6199,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		register_read(ops[1], ops[2], should_forward(ops[2]));
 		break;
 
-	// OpAtomicStore unimplemented. Not sure what would use that.
-	// OpAtomicLoad seems to only be relevant for atomic counters.
+	// OpAtomicStore unimplemented. Not sure what would use that.
+	// OpAtomicLoad seems to only be relevant for atomic counters.
 
 	case OpAtomicIIncrement:
 		forced_temporaries.insert(ops[1]);
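On `force_temp_use_for_two_vector_shuffles`: a two-source OpVectorShuffle has no single-source swizzle form, so it is emitted constructor-style, and MSL now binds both sources to temporaries instead of forwarding them. A hand-written illustration (not generated output) of the resulting pattern, which keeps a complex or packed source expression from being expanded once per component:

```cpp
// shuffle(a, b, <0, 5, 2, 7>) lowered constructor-style with forced temps.
float4 shuffle_example(float4 a, float4 b)
{
    float4 _1 = a; // forced temporary for source 0
    float4 _2 = b; // forced temporary for source 1
    return float4(_1.x, _2.y, _1.z, _2.w);
}
```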
@@ -6905,7 +6937,7 @@ bool CompilerGLSL::member_is_packed_type(const SPIRType &type, uint32_t index) const
 // row_major matrix result of the expression to a column_major matrix.
 // Base implementation uses the standard library transpose() function.
 // Subclasses may override to use a different function.
-string CompilerGLSL::convert_row_major_matrix(string exp_str)
+string CompilerGLSL::convert_row_major_matrix(string exp_str, const SPIRType & /*exp_type*/)
 {
 	strip_enclosed_expression(exp_str);
 	return join("transpose(", exp_str, ")");
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index aad1527a..67076272 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -290,10 +290,10 @@ protected:
 	void add_resource_name(uint32_t id);
 	void add_member_name(SPIRType &type, uint32_t name);
 
-	bool is_non_native_row_major_matrix(uint32_t id);
-	bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index);
+	virtual bool is_non_native_row_major_matrix(uint32_t id);
+	virtual bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index);
 	bool member_is_packed_type(const SPIRType &type, uint32_t index) const;
-	virtual std::string convert_row_major_matrix(std::string exp_str);
+	virtual std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type);
 
 	std::unordered_set<std::string> local_variable_names;
 	std::unordered_set<std::string> resource_names;
@@ -321,6 +321,8 @@ protected:
 		bool boolean_mix_support = true;
 		bool allow_precision_qualifiers = false;
 		bool can_swizzle_scalar = false;
+		bool force_temp_use_for_two_vector_shuffles = false;
+
 	} backend;
 
 	void emit_struct(SPIRType &type);
@@ -371,9 +373,10 @@ protected:
 	SPIRExpression &emit_op(uint32_t result_type, uint32_t result_id, const std::string &rhs, bool forward_rhs,
 	                        bool suppress_usage_tracking = false);
 	std::string access_chain_internal(uint32_t base, const uint32_t *indices, uint32_t count, bool index_is_literal,
-	                                  bool chain_only = false, bool *need_transpose = nullptr);
+	                                  bool chain_only = false, bool *need_transpose = nullptr,
+	                                  bool *result_is_packed = nullptr);
 	std::string access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type,
-	                         bool *need_transpose = nullptr);
+	                         bool *need_transpose = nullptr, bool *result_is_packed = nullptr);
 
 	std::string flattened_access_chain(uint32_t base, const uint32_t *indices, uint32_t count,
 	                                   const SPIRType &target_type, uint32_t offset, uint32_t matrix_stride,
diff --git a/spirv_msl.cpp b/spirv_msl.cpp
index 087c4150..8e8a8db7 100644
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -57,6 +57,21 @@ string CompilerMSL::compile()
 	// Force a classic "C" locale, reverts when function returns
 	ClassicLocale classic_locale;
 
+	// Do not deal with GLES-isms like precision, older extensions and such.
+	CompilerGLSL::options.vulkan_semantics = true;
+	CompilerGLSL::options.es = false;
+	CompilerGLSL::options.version = 120;
+	backend.float_literal_suffix = false;
+	backend.uint32_t_literal_suffix = true;
+	backend.basic_int_type = "int";
+	backend.basic_uint_type = "uint";
+	backend.discard_literal = "discard_fragment()";
+	backend.swizzle_is_function = false;
+	backend.shared_is_implied = false;
+	backend.native_row_major_matrix = false;
+	backend.flexible_member_array_supported = false;
+	backend.force_temp_use_for_two_vector_shuffles = true;
+
 	replace_illegal_names();
 
 	non_stage_in_input_var_ids.clear();
@@ -88,20 +103,6 @@ string CompilerMSL::compile()
 	if (options.resolve_specialized_array_lengths)
 		resolve_specialized_array_lengths();
 
-	// Do not deal with GLES-isms like precision, older extensions and such.
-	CompilerGLSL::options.vulkan_semantics = true;
-	CompilerGLSL::options.es = false;
-	CompilerGLSL::options.version = 120;
-	backend.float_literal_suffix = false;
-	backend.uint32_t_literal_suffix = true;
-	backend.basic_int_type = "int";
-	backend.basic_uint_type = "uint";
-	backend.discard_literal = "discard_fragment()";
-	backend.swizzle_is_function = false;
-	backend.shared_is_implied = false;
-	backend.native_row_major_matrix = false;
-	backend.flexible_member_array_supported = false;
-
 	uint32_t pass_count = 0;
 	do
 	{
@@ -279,6 +280,15 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::set<uint32_t> &added_arg_ids,
 		}
 		case OpFunctionCall:
 		{
+			// First see if any of the function call args are globals
+			for (uint32_t arg_idx = 3; arg_idx < i.length; arg_idx++)
+			{
+				uint32_t arg_id = ops[arg_idx];
+				if (global_var_ids.find(arg_id) != global_var_ids.end())
+					added_arg_ids.insert(arg_id);
+			}
+
+			// Then recurse into the function itself to extract globals used internally in the function
 			uint32_t inner_func_id = ops[2];
 			std::set<uint32_t> inner_func_args;
 			extract_global_variables_from_function(inner_func_id, inner_func_args, global_var_ids,
@@ -306,12 +316,10 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std::set<uint32_t> &added_arg_ids,
 			func.add_parameter(type_id, next_id, true);
 			set<SPIRVariable>(next_id, type_id, StorageClassFunction, 0, arg_id);
 
-			// Ensure both the existing and new variables have the same name, and the name is valid
-			string vld_name = ensure_valid_name(to_name(arg_id), "v");
-			set_name(arg_id, vld_name);
-			set_name(next_id, vld_name);
+			// Ensure the existing variable has a valid name and the new variable has all the same meta info
+			set_name(arg_id, ensure_valid_name(to_name(arg_id), "v"));
+			meta[next_id] = meta[arg_id];
 
-			meta[next_id].decoration.qualified_alias = meta[arg_id].decoration.qualified_alias;
 			next_id++;
 		}
 	}
@@ -811,8 +819,8 @@ string CompilerMSL::unpack_expression_type(string expr_str, const SPIRType &type)
 // Emits the file header info
 void CompilerMSL::emit_header()
 {
-	for (auto &header : pragma_lines)
-		statement(header);
+	for (auto &pragma : pragma_lines)
+		statement(pragma);
 
 	if (!pragma_lines.empty())
 		statement("");
@@ -830,7 +838,7 @@ void CompilerMSL::emit_header()
 
 void CompilerMSL::add_pragma_line(const string &line)
 {
-	pragma_lines.push_back(line);
+	pragma_lines.insert(line);
 }
 
 // Emits any needed custom function bodies.
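Switching `pragma_lines` from `std::vector` to `std::set` makes repeated `add_pragma_line()` calls idempotent: each generated helper may request the same diagnostic pragma, but only one copy is emitted. A tiny standalone demonstration of the container behavior being relied on:

```cpp
#include <iostream>
#include <set>
#include <string>

int main()
{
    std::set<std::string> pragma_lines;
    // Several helpers may request the same pragma; the set keeps one.
    pragma_lines.insert("#pragma clang diagnostic ignored \"-Wmissing-prototypes\"");
    pragma_lines.insert("#pragma clang diagnostic ignored \"-Wmissing-prototypes\"");
    std::cout << pragma_lines.size() << "\n"; // prints 1
}
```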
@@ -1041,6 +1049,64 @@ void CompilerMSL::emit_custom_functions()
 			statement("");
 			break;
 
+		case SPVFuncImplRowMajor2x3:
+			statement("// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.");
+			statement("float2x3 spvConvertFromRowMajor2x3(float2x3 m)");
+			begin_scope();
+			statement("return float2x3(float3(m[0][0], m[0][2], m[1][1]), float3(m[0][1], m[1][0], m[1][2]));");
+			end_scope();
+			statement("");
+			break;
+
+		case SPVFuncImplRowMajor2x4:
+			statement("// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.");
+			statement("float2x4 spvConvertFromRowMajor2x4(float2x4 m)");
+			begin_scope();
+			statement("return float2x4(float4(m[0][0], m[0][2], m[1][0], m[1][2]), float4(m[0][1], m[0][3], m[1][1], "
+			          "m[1][3]));");
+			end_scope();
+			statement("");
+			break;
+
+		case SPVFuncImplRowMajor3x2:
+			statement("// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.");
+			statement("float3x2 spvConvertFromRowMajor3x2(float3x2 m)");
+			begin_scope();
+			statement("return float3x2(float2(m[0][0], m[1][1]), float2(m[0][1], m[2][0]), float2(m[1][0], m[2][1]));");
+			end_scope();
+			statement("");
+			break;
+
+		case SPVFuncImplRowMajor3x4:
+			statement("// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.");
+			statement("float3x4 spvConvertFromRowMajor3x4(float3x4 m)");
+			begin_scope();
+			statement("return float3x4(float4(m[0][0], m[0][3], m[1][2], m[2][1]), float4(m[0][1], m[1][0], m[1][3], "
+			          "m[2][2]), float4(m[0][2], m[1][1], m[2][0], m[2][3]));");
+			end_scope();
+			statement("");
+			break;
+
+		case SPVFuncImplRowMajor4x2:
+			statement("// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.");
+			statement("float4x2 spvConvertFromRowMajor4x2(float4x2 m)");
+			begin_scope();
+			statement("return float4x2(float2(m[0][0], m[2][0]), float2(m[0][1], m[2][1]), float2(m[1][0], m[3][0]), "
+			          "float2(m[1][1], m[3][1]));");
+			end_scope();
+			statement("");
+			break;
+
+		case SPVFuncImplRowMajor4x3:
+			statement("// Implementation of a conversion of matrix content from RowMajor to ColumnMajor organization.");
+			statement("float4x3 spvConvertFromRowMajor4x3(float4x3 m)");
+			begin_scope();
+			statement("return float4x3(float3(m[0][0], m[1][1], m[2][2]), float3(m[0][1], m[1][2], m[3][0]), "
+			          "float3(m[0][2], m[2][0], m[3][1]), float3(m[1][0], m[2][1], m[3][2]));");
+			end_scope();
+			statement("");
+			break;
+
 		default:
 			break;
 		}
@@ -1541,6 +1607,24 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 		emit_barrier(ops[0], ops[1], ops[2]);
 		break;
 
+	case OpVectorTimesMatrix:
+	case OpMatrixTimesVector:
+	{
+		// If the matrix needs transpose and it is square, just flip the multiply order.
+		uint32_t mtx_id = ops[opcode == OpMatrixTimesVector ? 2 : 3];
+		auto *e = maybe_get<SPIRExpression>(mtx_id);
+		auto &t = expression_type(mtx_id);
+		if (e && e->need_transpose && t.columns == t.vecsize)
+		{
+			e->need_transpose = false;
+			emit_binary_op(ops[0], ops[1], ops[3], ops[2], "*");
+			e->need_transpose = true;
+		}
+		else
+			BOP(*);
+		break;
+	}
+
 	// OpOuterProduct
 
 	default:
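The operand-flipping trick in the OpMatrixTimesVector case avoids materializing a transpose for square matrices. It rests on the standard identity (in shader terms, `transpose(M) * v == v * M`):

```latex
(M^{\top} v)_i \;=\; \sum_j M_{ji}\, v_j \;=\; (v\, M)_i
\qquad\Longrightarrow\qquad
\operatorname{transpose}(M) \cdot v \;=\; v \cdot M \quad (M \text{ square})
```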
@@ -1559,34 +1643,18 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem)
 	string bar_stmt = "threadgroup_barrier(mem_flags::";
 
 	uint32_t mem_sem = id_mem_sem ? get<SPIRConstant>(id_mem_sem).scalar() : uint32_t(MemorySemanticsMaskNone);
-	switch (mem_sem)
-	{
-	case MemorySemanticsCrossWorkgroupMemoryMask:
+
+	if (mem_sem & MemorySemanticsCrossWorkgroupMemoryMask)
 		bar_stmt += "mem_device";
-		break;
-
-	case MemorySemanticsSubgroupMemoryMask:
-	case MemorySemanticsWorkgroupMemoryMask:
-	case MemorySemanticsAtomicCounterMemoryMask:
+	else if (mem_sem & (MemorySemanticsSubgroupMemoryMask | MemorySemanticsWorkgroupMemoryMask |
+	                    MemorySemanticsAtomicCounterMemoryMask))
 		bar_stmt += "mem_threadgroup";
-		break;
-
-	case MemorySemanticsImageMemoryMask:
+	else if (mem_sem & MemorySemanticsImageMemoryMask)
 		bar_stmt += "mem_texture";
-		break;
-
-	case MemorySemanticsAcquireMask:
-	case MemorySemanticsReleaseMask:
-	case MemorySemanticsAcquireReleaseMask:
-	case MemorySemanticsSequentiallyConsistentMask:
-	case MemorySemanticsUniformMemoryMask:
-	case MemorySemanticsMaskNone:
-	default:
+	else
 		bar_stmt += "mem_none";
-		break;
-	}
 
-	if (options.supports_msl_version(2))
+	if (options.is_ios() && options.supports_msl_version(2))
 	{
 		bar_stmt += ", ";
 
@@ -1919,7 +1987,7 @@ void CompilerMSL::emit_function_prototype(SPIRFunction &func, uint64_t)
 
 		// Manufacture automatic sampler arg for SampledImage texture
 		auto &arg_type = get<SPIRType>(arg.type);
-		if (arg_type.basetype == SPIRType::SampledImage)
+		if (arg_type.basetype == SPIRType::SampledImage && arg_type.image.dim != DimBuffer)
 			decl += ", thread const sampler& " + to_sampler_expression(arg.id);
 
 		if (&arg != &func.arguments.back())
@@ -2213,7 +2281,7 @@ string CompilerMSL::to_func_call_arg(uint32_t id)
 	{
 		auto &var = id_v.get<SPIRVariable>();
 		auto &type = get<SPIRType>(var.basetype);
-		if (type.basetype == SPIRType::SampledImage)
+		if (type.basetype == SPIRType::SampledImage && type.image.dim != DimBuffer)
 			arg_str += ", " + to_sampler_expression(id);
 	}
 
@@ -2229,6 +2297,84 @@ string CompilerMSL::to_sampler_expression(uint32_t id)
 	return samp_id ? to_expression(samp_id) : to_expression(id) + sampler_name_suffix;
 }
 
+// Checks whether the ID is a row_major matrix that requires conversion before use
+bool CompilerMSL::is_non_native_row_major_matrix(uint32_t id)
+{
+	// Natively supported row-major matrices do not need to be converted.
+	if (backend.native_row_major_matrix)
+		return false;
+
+	// Non-matrix or column-major matrix types do not need to be converted.
+	if (!(meta[id].decoration.decoration_flags & (1ull << DecorationRowMajor)))
+		return false;
+
+	// Generate a function that will swap matrix elements from row-major to column-major.
+	const auto type = expression_type(id);
+	add_convert_row_major_matrix_function(type.columns, type.vecsize);
+	return true;
+}
+
+// Checks whether the member is a row_major matrix that requires conversion before use
+bool CompilerMSL::member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index)
+{
+	// Natively supported row-major matrices do not need to be converted.
+	if (backend.native_row_major_matrix)
+		return false;
+
+	// Non-matrix or column-major matrix types do not need to be converted.
+	if (!(combined_decoration_for_member(type, index) & (1ull << DecorationRowMajor)))
+		return false;
+
+	// Generate a function that will swap matrix elements from row-major to column-major.
+	const auto mbr_type = get<SPIRType>(type.member_types[index]);
+	add_convert_row_major_matrix_function(mbr_type.columns, mbr_type.vecsize);
+	return true;
+}
+
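Why `emit_barrier` moved from a `switch` to bit tests: SPIR-V MemorySemantics is a bitmask, and real modules combine bits, which a `switch` can only match as exact values. A minimal demonstration (mask values taken from the SPIR-V spec):

```cpp
#include <cstdint>
#include <cstdio>

enum : uint32_t
{
    AcquireRelease  = 0x8,
    WorkgroupMemory = 0x100,
    CrossWorkgroup  = 0x200,
    ImageMemory     = 0x800,
};

const char *mem_flags(uint32_t sem)
{
    if (sem & CrossWorkgroup)
        return "mem_device";
    if (sem & WorkgroupMemory)
        return "mem_threadgroup";
    if (sem & ImageMemory)
        return "mem_texture";
    return "mem_none";
}

int main()
{
    // A combined mask: the old exact-match switch would have hit default.
    printf("%s\n", mem_flags(WorkgroupMemory | AcquireRelease)); // mem_threadgroup
}
```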
+// Adds a function suitable for converting a non-square row-major matrix to a column-major matrix.
+void CompilerMSL::add_convert_row_major_matrix_function(uint32_t cols, uint32_t rows)
+{
+	SPVFuncImpl spv_func;
+	if (cols == rows) // Square matrix...just use transpose() function
+		return;
+	else if (cols == 2 && rows == 3)
+		spv_func = SPVFuncImplRowMajor2x3;
+	else if (cols == 2 && rows == 4)
+		spv_func = SPVFuncImplRowMajor2x4;
+	else if (cols == 3 && rows == 2)
+		spv_func = SPVFuncImplRowMajor3x2;
+	else if (cols == 3 && rows == 4)
+		spv_func = SPVFuncImplRowMajor3x4;
+	else if (cols == 4 && rows == 2)
+		spv_func = SPVFuncImplRowMajor4x2;
+	else if (cols == 4 && rows == 3)
+		spv_func = SPVFuncImplRowMajor4x3;
+	else
+		SPIRV_CROSS_THROW("Could not convert row-major matrix.");
+
+	auto rslt = spv_function_implementations.insert(spv_func);
+	if (rslt.second)
+	{
+		add_pragma_line("#pragma clang diagnostic ignored \"-Wmissing-prototypes\"");
+		force_recompile = true;
+	}
+}
+
+// Wraps the expression string in a function call that converts the
+// row_major matrix result of the expression to a column_major matrix.
+string CompilerMSL::convert_row_major_matrix(string exp_str, const SPIRType &exp_type)
+{
+	strip_enclosed_expression(exp_str);
+
+	string func_name;
+	if (exp_type.columns == exp_type.vecsize)
+		func_name = "transpose";
+	else
+		func_name = string("spvConvertFromRowMajor") + to_string(exp_type.columns) + "x" + to_string(exp_type.vecsize);
+
+	return join(func_name, "(", exp_str, ")");
+}
+
 // Called automatically at the end of the entry point function
 void CompilerMSL::emit_fixup()
 {
@@ -2237,10 +2383,8 @@ void CompilerMSL::emit_fixup()
 	if ((execution.model == ExecutionModelVertex) && stage_out_var_id && !qual_pos_var_name.empty())
 	{
 		if (CompilerGLSL::options.vertex.fixup_clipspace)
-		{
 			statement(qual_pos_var_name, ".z = (", qual_pos_var_name, ".z + ", qual_pos_var_name,
 			          ".w) * 0.5;       // Adjust clip-space for Metal");
-		}
 
 		if (CompilerGLSL::options.vertex.flip_vert_y)
 			statement(qual_pos_var_name, ".y = -(", qual_pos_var_name, ".y);", "    // Invert Y-axis for Metal");
@@ -2599,6 +2743,7 @@ string CompilerMSL::entry_point_args(bool append_comma)
 			{
 				if (!ep_args.empty())
 					ep_args += ", ";
+
 				BuiltIn bi_type = meta[var_id].decoration.builtin_type;
 				ep_args += builtin_type_decl(bi_type) + " " + to_expression(var_id);
 				ep_args += " [[" + builtin_qualifier(bi_type) + "]]";
@@ -2682,7 +2827,10 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg)
 	if (constref)
 		decl += "const ";
 
-	decl += type_to_glsl(type, arg.id);
+	if (is_builtin_variable(var))
+		decl += builtin_type_decl((BuiltIn)get_decoration(arg.id, DecorationBuiltIn));
+	else
+		decl += type_to_glsl(type, arg.id);
 
 	if (is_array(type))
 		decl += "*";
@@ -3298,9 +3446,35 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, uint32_t length)
 		break;
 	}
 
-	// Keep track of the instruction return types, mapped by ID
-	if (length > 1)
-		result_types[args[1]] = args[0];
+	// If it has one, keep track of the instruction's result type, mapped by ID
+	switch (opcode)
+	{
+	case OpStore:
+	case OpCopyMemory:
+	case OpCopyMemorySized:
+	case OpImageWrite:
+	case OpAtomicStore:
+	case OpAtomicFlagClear:
+	case OpEmitStreamVertex:
+	case OpEndStreamPrimitive:
+	case OpControlBarrier:
+	case OpMemoryBarrier:
+	case OpGroupWaitEvents:
+	case OpRetainEvent:
+	case OpReleaseEvent:
+	case OpSetUserEventStatus:
+	case OpCaptureEventProfilingInfo:
+	case OpCommitReadPipe:
+	case OpCommitWritePipe:
+	case OpGroupCommitReadPipe:
+	case OpGroupCommitWritePipe:
+		break;
+
+	default:
+		if (length > 1)
+			result_types[args[1]] = args[0];
+		break;
+	}
 
 	return true;
 }
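The opcode exclusion list above exists because most SPIR-V instructions put the result type in word 1 and the result ID in word 2, but store-like instructions have no result at all, so their operands would be misread. A hypothetical mini-handler mirroring the fixed logic (opcode value per the SPIR-V spec):

```cpp
#include <cstdint>
#include <unordered_map>

void record_result_type(uint16_t opcode, const uint32_t *args, uint32_t length,
                        std::unordered_map<uint32_t, uint32_t> &result_types)
{
    const uint16_t OpStore = 62; // no result ID; operands are {pointer, object}
    switch (opcode)
    {
    case OpStore:
        break; // mapping args[1] -> args[0] here would record "object -> pointer"
    default:
        if (length > 1)
            result_types[args[1]] = args[0]; // result ID -> result type
        break;
    }
}
```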
diff --git a/spirv_msl.hpp b/spirv_msl.hpp
index a7082ae3..fbc8dadf 100644
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -77,10 +77,26 @@ public:
 	// Options for compiling to Metal Shading Language
 	struct Options
 	{
+		typedef enum {
+			iOS,
+			macOS,
+		} Platform;
+
+		Platform platform = macOS;
 		uint32_t msl_version = make_msl_version(1, 2);
 		bool enable_point_size_builtin = true;
 		bool resolve_specialized_array_lengths = true;
 
+		bool is_ios()
+		{
+			return platform == iOS;
+		}
+
+		bool is_macos()
+		{
+			return platform == macOS;
+		}
+
 		void set_msl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
 		{
 			msl_version = make_msl_version(major, minor, patch);
@@ -122,6 +138,12 @@ public:
 		SPVFuncImplInverse2x2,
 		SPVFuncImplInverse3x3,
 		SPVFuncImplInverse4x4,
+		SPVFuncImplRowMajor2x3,
+		SPVFuncImplRowMajor2x4,
+		SPVFuncImplRowMajor3x2,
+		SPVFuncImplRowMajor3x4,
+		SPVFuncImplRowMajor4x2,
+		SPVFuncImplRowMajor4x3,
 	};
 
 	// Constructs an instance to compile the SPIR-V code into Metal Shading Language,
@@ -186,6 +208,9 @@ protected:
 	std::string to_qualifiers_glsl(uint32_t id) override;
 	void replace_illegal_names() override;
 	void declare_undefined_values() override;
+	bool is_non_native_row_major_matrix(uint32_t id) override;
+	bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index) override;
+	std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type) override;
 
 	void preprocess_op_codes();
 	void localize_global_variables();
@@ -207,6 +232,7 @@ protected:
 	void emit_interface_block(uint32_t ib_var_id);
 	bool maybe_emit_input_struct_assignment(uint32_t id_lhs, uint32_t id_rhs);
 	bool maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs);
+	void add_convert_row_major_matrix_function(uint32_t cols, uint32_t rows);
 
 	std::string func_type_decl(SPIRType &type);
 	std::string entry_point_args(bool append_comma);
@@ -245,7 +271,7 @@ protected:
 	std::unordered_map<uint32_t, MSLVertexAttr *> vtx_attrs_by_location;
 	std::map<uint32_t, uint32_t> non_stage_in_input_var_ids;
 	std::unordered_map<uint32_t, uint32_t> struct_member_padding;
-	std::vector<std::string> pragma_lines;
+	std::set<std::string> pragma_lines;
 	std::vector<MSLResourceBinding *> resource_bindings;
 	MSLResourceBinding next_metal_resource_index;
 	uint32_t stage_in_var_id = 0;
diff --git a/test_shaders.py b/test_shaders.py
index b83bd052..1ca8f9af 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -358,11 +358,20 @@ def test_shader_msl(stats, shader, update, keep, opt):
     noopt = shader_is_noopt(shader[1])
     spirv, msl = cross_compile_msl(joined_path, is_spirv, opt and (not noopt))
     regression_check(shader, msl, update, keep, opt)
-    os.remove(spirv)
+
+    # Uncomment the following line to print the temp SPIR-V file path.
+    # This temp SPIR-V file is not deleted until after the Metal validation step below.
+    # If Metal validation fails, the temp SPIR-V file can be copied out and
+    # used as input to an invocation of spirv-cross to debug from Xcode directly.
+    # To do so, build spirv-cross using `make DEBUG=1`, then run the spirv-cross
+    # executable from Xcode using args: `--msl --entry main --output msl_path spirv_path`.
+#    print('SPIR-V shader: ' + spirv)
 
     if not force_no_external_validation:
         validate_shader_msl(shader, opt)
 
+    os.remove(spirv)
+
 def test_shader_hlsl(stats, shader, update, keep, opt):
     joined_path = os.path.join(shader[0], shader[1])
     print('Testing HLSL shader:', joined_path)
diff --git a/test_shaders.sh b/test_shaders.sh
index 0657cb5c..a3608730 100755
--- a/test_shaders.sh
+++ b/test_shaders.sh
@@ -11,6 +11,7 @@ echo "Using spirv-opt in: $(which spirv-opt)."
 ./test_shaders.py shaders --opt || exit 1
 ./test_shaders.py shaders-msl --msl || exit 1
 ./test_shaders.py shaders-msl --msl --opt || exit 1
+./test_shaders.py shaders-msl-no-opt --msl || exit 1
 ./test_shaders.py shaders-hlsl --hlsl || exit 1
 ./test_shaders.py shaders-hlsl --hlsl --opt || exit 1
 
diff --git a/update_test_shaders.sh b/update_test_shaders.sh
index 69165847..712c3eec 100755
--- a/update_test_shaders.sh
+++ b/update_test_shaders.sh
@@ -11,6 +11,7 @@ echo "Using spirv-opt in: $(which spirv-opt)."
 ./test_shaders.py shaders --update --opt || exit 1
 ./test_shaders.py shaders-msl --msl --update || exit 1
 ./test_shaders.py shaders-msl --msl --update --opt || exit 1
+./test_shaders.py shaders-msl-no-opt --msl --update || exit 1
 ./test_shaders.py shaders-hlsl --hlsl --update || exit 1
 ./test_shaders.py shaders-hlsl --hlsl --update --opt || exit 1
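A hedged usage sketch for the `Options::Platform` field added in spirv_msl.hpp above (field and method names as in the header; the surrounding setup follows the get/set-options pattern of the time and is illustrative only). Per the emit_barrier change, the device-coherence argument is only emitted for iOS with MSL 2.0:

```cpp
#include "spirv_msl.hpp"
#include <string>
#include <vector>

std::string compile_for_ios(std::vector<uint32_t> spirv_binary)
{
    spirv_cross::CompilerMSL msl(std::move(spirv_binary));
    auto opts = msl.get_options();
    opts.platform = spirv_cross::CompilerMSL::Options::iOS;
    opts.set_msl_version(2); // iOS + MSL 2.0 enables coherent barrier scopes
    msl.set_options(opts);
    return msl.compile();
}
```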