diff --git a/.travis.yml b/.travis.yml index 593d472b..c3dfc6ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,3 +24,4 @@ script: - make -j2 - PATH=./glslang/StandAlone:./SPIRV-Tools/tools:$PATH - ./test_shaders.py shaders + - ./test_shaders.py --metal shaders-msl diff --git a/CMakeLists.txt b/CMakeLists.txt index ebf933e3..0532f962 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,9 @@ if (${PYTHONINTERP_FOUND}) add_test(NAME spirv-cross-test COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_shaders.py ${CMAKE_CURRENT_SOURCE_DIR}/shaders) + add_test(NAME spirv-cross-test-metal + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_shaders.py --metal + ${CMAKE_CURRENT_SOURCE_DIR}/shaders-msl) endif() else() message(WARNING "Testing disabled. Could not find python3. If you have python3 installed try running " diff --git a/README.md b/README.md index 415f9523..edc0452a 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,8 @@ In these cases, run `./test_shaders.py shaders --update` to update the reference Always make sure you are running up to date glslangValidator as well as SPIRV-Tools when updating reference files. In short, the master branch should always be able to run `./test_shaders.py shaders` without failure. +SPIRV-Cross uses Travis CI to test all pull requests, so it is not strictly needed to perform testing yourself if you have problems running it locally. +A pull request which does not pass testing on Travis will not be accepted however. When adding support for new features to SPIRV-Cross, a new shader and reference file should be added which covers usage of the new shader features in question. @@ -205,6 +207,10 @@ The current reference output is contained in reference/. See `./test_shaders.py --help` for more. +### Metal backend + +To test the roundtrip path GLSL -> SPIR-V -> MSL, `--metal` can be added, e.g. `./test_shaders.py --metal shaders-msl`. 
+ ### Updating regression tests When legitimate changes are found, use `--update` flag to update regression files. diff --git a/main.cpp b/main.cpp index ff79bcd1..c3443fa7 100644 --- a/main.cpp +++ b/main.cpp @@ -27,7 +27,7 @@ #include #ifdef _MSC_VER -#pragma warning(disable: 4996) +#pragma warning(disable : 4996) #endif using namespace spv; @@ -671,8 +671,12 @@ int main(int argc, char *argv[]) res = compiler->get_shader_resources(); if (args.flatten_ubo) + { for (auto &ubo : res.uniform_buffers) - compiler->flatten_interface_block(ubo.id); + compiler->flatten_buffer_block(ubo.id); + for (auto &ubo : res.push_constant_buffers) + compiler->flatten_buffer_block(ubo.id); + } auto pls_inputs = remap_pls(args.pls_in, res.stage_inputs, &res.subpass_inputs); auto pls_outputs = remap_pls(args.pls_out, res.stage_outputs, nullptr); diff --git a/reference/shaders-msl/vert/basic.vert b/reference/shaders-msl/vert/basic.vert new file mode 100644 index 00000000..d2c96d2c --- /dev/null +++ b/reference/shaders-msl/vert/basic.vert @@ -0,0 +1,32 @@ +#include +#include + +using namespace metal; + +struct UBO +{ + float4x4 uMVP; +}; + +struct main0_in +{ + float3 aNormal [[attribute(1)]]; + float4 aVertex [[attribute(0)]]; +}; + +struct main0_out +{ + float3 vNormal [[user(locn0)]]; + float4 gl_Position [[position]]; + float gl_PointSize; +}; + +vertex main0_out main0(main0_in in [[stage_in]], constant UBO& _16 [[buffer(0)]]) +{ + main0_out out = {}; + out.gl_Position = _16.uMVP * in.aVertex; + out.vNormal = in.aNormal; + out.gl_Position.y = -(out.gl_Position.y); // Invert Y-axis for Metal + return out; +} + diff --git a/reference/shaders/comp/basic.comp b/reference/shaders/comp/basic.comp index ca2503bd..14850899 100644 --- a/reference/shaders/comp/basic.comp +++ b/reference/shaders/comp/basic.comp @@ -1,12 +1,12 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer 
SSBO { vec4 in_data[]; } _23; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _45; diff --git a/reference/shaders/comp/culling.comp b/reference/shaders/comp/culling.comp index cd284b96..fd83bfcb 100644 --- a/reference/shaders/comp/culling.comp +++ b/reference/shaders/comp/culling.comp @@ -1,12 +1,12 @@ #version 310 es layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { float in_data[]; } _22; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { float out_data[]; } _38; diff --git a/reference/shaders/comp/dowhile.comp b/reference/shaders/comp/dowhile.comp index 16ba4001..e717961a 100644 --- a/reference/shaders/comp/dowhile.comp +++ b/reference/shaders/comp/dowhile.comp @@ -1,13 +1,13 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { mat4 mvp; vec4 in_data[]; } _28; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _52; diff --git a/reference/shaders/comp/generate_height.comp b/reference/shaders/comp/generate_height.comp index a2128dd8..1367e951 100644 --- a/reference/shaders/comp/generate_height.comp +++ b/reference/shaders/comp/generate_height.comp @@ -1,7 +1,7 @@ #version 310 es layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer Distribution +layout(binding = 0, std430) readonly buffer Distribution { vec2 distribution[]; } _190; @@ -11,7 +11,7 @@ layout(binding = 2, std140) uniform UBO vec4 uModTime; } _218; -layout(binding = 1, std430) buffer HeightmapFFT +layout(binding = 1, std430) writeonly buffer HeightmapFFT { uint heights[]; } _276; diff --git a/reference/shaders/comp/inout-struct.invalid.comp 
b/reference/shaders/comp/inout-struct.invalid.comp index 1aaa48f2..eec37a19 100644 --- a/reference/shaders/comp/inout-struct.invalid.comp +++ b/reference/shaders/comp/inout-struct.invalid.comp @@ -9,17 +9,17 @@ struct Foo vec4 d; }; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) readonly buffer SSBO2 { vec4 data[]; } indata; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) writeonly buffer SSBO { vec4 data[]; } outdata; -layout(binding = 2, std430) buffer SSBO3 +layout(binding = 2, std430) readonly buffer SSBO3 { Foo foos[]; } foobar; diff --git a/reference/shaders/comp/insert.comp b/reference/shaders/comp/insert.comp index 6c10020c..cbe1e27f 100644 --- a/reference/shaders/comp/insert.comp +++ b/reference/shaders/comp/insert.comp @@ -1,7 +1,7 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) writeonly buffer SSBO { vec4 out_data[]; } _27; diff --git a/reference/shaders/comp/loop.comp b/reference/shaders/comp/loop.comp index 9853acaa..049a3066 100644 --- a/reference/shaders/comp/loop.comp +++ b/reference/shaders/comp/loop.comp @@ -1,13 +1,13 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { mat4 mvp; vec4 in_data[]; } _24; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _177; diff --git a/reference/shaders/comp/mat3.comp b/reference/shaders/comp/mat3.comp index dc302396..2b050f5d 100644 --- a/reference/shaders/comp/mat3.comp +++ b/reference/shaders/comp/mat3.comp @@ -1,7 +1,7 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { mat3 out_data[]; } _22; diff --git a/reference/shaders/comp/mod.comp 
b/reference/shaders/comp/mod.comp index dfb9cf4c..4be0c5f7 100644 --- a/reference/shaders/comp/mod.comp +++ b/reference/shaders/comp/mod.comp @@ -1,12 +1,12 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { vec4 in_data[]; } _23; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _33; diff --git a/reference/shaders/comp/modf.comp b/reference/shaders/comp/modf.comp index 721d812f..c92149bf 100644 --- a/reference/shaders/comp/modf.comp +++ b/reference/shaders/comp/modf.comp @@ -1,12 +1,12 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { vec4 in_data[]; } _23; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _35; diff --git a/reference/shaders/comp/read-write-only.comp b/reference/shaders/comp/read-write-only.comp new file mode 100644 index 00000000..06227ee2 --- /dev/null +++ b/reference/shaders/comp/read-write-only.comp @@ -0,0 +1,27 @@ +#version 310 es +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 2, std430) restrict writeonly buffer SSBO2 +{ + vec4 data4; + vec4 data5; +} _10; + +layout(binding = 0, std430) readonly buffer SSBO0 +{ + vec4 data0; + vec4 data1; +} _15; + +layout(binding = 1, std430) restrict buffer SSBO1 +{ + vec4 data2; + vec4 data3; +} _21; + +void main() +{ + _10.data4 = _15.data0 + _21.data2; + _10.data5 = _15.data1 + _21.data3; +} + diff --git a/reference/shaders/comp/return.comp b/reference/shaders/comp/return.comp index 20d61d25..4be20e93 100644 --- a/reference/shaders/comp/return.comp +++ b/reference/shaders/comp/return.comp @@ -1,7 +1,7 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; 
-layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _27; diff --git a/reference/shaders/comp/shared.comp b/reference/shaders/comp/shared.comp index e2ff6045..d0987a65 100644 --- a/reference/shaders/comp/shared.comp +++ b/reference/shaders/comp/shared.comp @@ -1,12 +1,12 @@ #version 310 es layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { float in_data[]; } _22; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { float out_data[]; } _44; diff --git a/reference/shaders/comp/struct-layout.comp b/reference/shaders/comp/struct-layout.comp index 1cbf5dfb..4feea8be 100644 --- a/reference/shaders/comp/struct-layout.comp +++ b/reference/shaders/comp/struct-layout.comp @@ -6,12 +6,12 @@ struct Foo mat4 m; }; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { Foo out_data[]; } _23; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { Foo in_data[]; } _30; diff --git a/reference/shaders/comp/torture-loop.comp b/reference/shaders/comp/torture-loop.comp index ae183190..645af5c3 100644 --- a/reference/shaders/comp/torture-loop.comp +++ b/reference/shaders/comp/torture-loop.comp @@ -1,13 +1,13 @@ #version 310 es layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; -layout(binding = 0, std430) buffer SSBO +layout(binding = 0, std430) readonly buffer SSBO { mat4 mvp; vec4 in_data[]; } _24; -layout(binding = 1, std430) buffer SSBO2 +layout(binding = 1, std430) writeonly buffer SSBO2 { vec4 out_data[]; } _89; diff --git a/reference/shaders/flatten/array.flatten.vert b/reference/shaders/flatten/array.flatten.vert new file mode 100644 index 00000000..d7b60fd1 --- /dev/null +++ b/reference/shaders/flatten/array.flatten.vert @@ -0,0 +1,12 @@ +#version 310 es + +uniform vec4 UBO[56]; 
+in vec4 aVertex; + +void main() +{ + vec4 a4 = UBO[23]; + vec4 offset = (UBO[50] + UBO[45]) + vec4(UBO[54].x); + gl_Position = ((mat4(UBO[40], UBO[41], UBO[42], UBO[43]) * aVertex) + UBO[55]) + offset; +} + diff --git a/reference/shaders/flatten/basic.flatten.vert b/reference/shaders/flatten/basic.flatten.vert new file mode 100644 index 00000000..2f65d2ec --- /dev/null +++ b/reference/shaders/flatten/basic.flatten.vert @@ -0,0 +1,13 @@ +#version 310 es + +uniform vec4 UBO[4]; +in vec4 aVertex; +out vec3 vNormal; +in vec3 aNormal; + +void main() +{ + gl_Position = mat4(UBO[0], UBO[1], UBO[2], UBO[3]) * aVertex; + vNormal = aNormal; +} + diff --git a/reference/shaders/flatten/copy.flatten.vert b/reference/shaders/flatten/copy.flatten.vert new file mode 100644 index 00000000..7e653d9d --- /dev/null +++ b/reference/shaders/flatten/copy.flatten.vert @@ -0,0 +1,29 @@ +#version 310 es + +struct Light +{ + vec3 Position; + float Radius; + vec4 Color; +}; + +uniform vec4 UBO[12]; +in vec4 aVertex; +out vec4 vColor; +in vec3 aNormal; + +void main() +{ + gl_Position = mat4(UBO[0], UBO[1], UBO[2], UBO[3]) * aVertex; + vColor = vec4(0.0); + for (int i = 0; i < 4; i++) + { + Light light; + light.Position = Light(UBO[i * 2 + 4].xyz, UBO[i * 2 + 4].w, UBO[i * 2 + 5]).Position; + light.Radius = Light(UBO[i * 2 + 4].xyz, UBO[i * 2 + 4].w, UBO[i * 2 + 5]).Radius; + light.Color = Light(UBO[i * 2 + 4].xyz, UBO[i * 2 + 4].w, UBO[i * 2 + 5]).Color; + vec3 L = aVertex.xyz - light.Position; + vColor += (((UBO[i * 2 + 5]) * clamp(1.0 - (length(L) / light.Radius), 0.0, 1.0)) * dot(aNormal, normalize(L))); + } +} + diff --git a/reference/shaders/flatten/dynamic.flatten.vert b/reference/shaders/flatten/dynamic.flatten.vert new file mode 100644 index 00000000..4a36ee82 --- /dev/null +++ b/reference/shaders/flatten/dynamic.flatten.vert @@ -0,0 +1,25 @@ +#version 310 es + +struct Light +{ + vec3 Position; + float Radius; + vec4 Color; +}; + +uniform vec4 UBO[12]; +in vec4 aVertex; +out vec4 
vColor; +in vec3 aNormal; + +void main() +{ + gl_Position = mat4(UBO[0], UBO[1], UBO[2], UBO[3]) * aVertex; + vColor = vec4(0.0); + for (int i = 0; i < 4; i++) + { + vec3 L = aVertex.xyz - (UBO[i * 2 + 4].xyz); + vColor += (((UBO[i * 2 + 5]) * clamp(1.0 - (length(L) / (UBO[i * 2 + 4].w)), 0.0, 1.0)) * dot(aNormal, normalize(L))); + } +} + diff --git a/reference/shaders/flatten/matrixindex.flatten.vert b/reference/shaders/flatten/matrixindex.flatten.vert new file mode 100644 index 00000000..4deeaf1d --- /dev/null +++ b/reference/shaders/flatten/matrixindex.flatten.vert @@ -0,0 +1,19 @@ +#version 310 es + +uniform vec4 UBO[14]; +out vec4 oA; +out vec4 oB; +out vec4 oC; +out vec4 oD; +out vec4 oE; + +void main() +{ + gl_Position = vec4(0.0); + oA = UBO[1]; + oB = vec4(UBO[4].y, UBO[5].y, UBO[6].y, UBO[7].y); + oC = UBO[9]; + oD = vec4(UBO[10].x, UBO[11].x, UBO[12].x, UBO[13].x); + oE = vec4(UBO[1].z, UBO[6].y, UBO[9].z, UBO[12].y); +} + diff --git a/reference/shaders/flatten/multiindex.flatten.vert b/reference/shaders/flatten/multiindex.flatten.vert new file mode 100644 index 00000000..f046f719 --- /dev/null +++ b/reference/shaders/flatten/multiindex.flatten.vert @@ -0,0 +1,10 @@ +#version 310 es + +uniform vec4 UBO[15]; +in ivec2 aIndex; + +void main() +{ + gl_Position = UBO[aIndex.x * 5 + aIndex.y * 1 + 0]; +} + diff --git a/reference/shaders/flatten/push-constant.flatten.vert b/reference/shaders/flatten/push-constant.flatten.vert new file mode 100644 index 00000000..216c1f9d --- /dev/null +++ b/reference/shaders/flatten/push-constant.flatten.vert @@ -0,0 +1,13 @@ +#version 310 es + +uniform vec4 PushMe[6]; +layout(location = 1) in vec4 Pos; +layout(location = 0) out vec2 vRot; +layout(location = 0) in vec2 Rot; + +void main() +{ + gl_Position = mat4(PushMe[0], PushMe[1], PushMe[2], PushMe[3]) * Pos; + vRot = (mat2(PushMe[4].xy, PushMe[4].zw) * Rot) + vec2(PushMe[5].z); +} + diff --git a/reference/shaders/flatten/rowmajor.flatten.vert 
b/reference/shaders/flatten/rowmajor.flatten.vert new file mode 100644 index 00000000..01d7c78e --- /dev/null +++ b/reference/shaders/flatten/rowmajor.flatten.vert @@ -0,0 +1,11 @@ +#version 310 es + +uniform vec4 UBO[12]; +in vec4 aVertex; + +void main() +{ + vec2 v = mat4x2(UBO[8].xy, UBO[9].xy, UBO[10].xy, UBO[11].xy) * aVertex; + gl_Position = (mat4(UBO[0], UBO[1], UBO[2], UBO[3]) * aVertex) + (aVertex * mat4(UBO[4], UBO[5], UBO[6], UBO[7])); +} + diff --git a/reference/shaders/flatten/struct.flatten.vert b/reference/shaders/flatten/struct.flatten.vert new file mode 100644 index 00000000..6bfda33b --- /dev/null +++ b/reference/shaders/flatten/struct.flatten.vert @@ -0,0 +1,22 @@ +#version 310 es + +struct Light +{ + vec3 Position; + float Radius; + vec4 Color; +}; + +uniform vec4 UBO[6]; +in vec4 aVertex; +out vec4 vColor; +in vec3 aNormal; + +void main() +{ + gl_Position = mat4(UBO[0], UBO[1], UBO[2], UBO[3]) * aVertex; + vColor = vec4(0.0); + vec3 L = aVertex.xyz - UBO[4].xyz; + vColor += ((UBO[5] * clamp(1.0 - (length(L) / UBO[4].w), 0.0, 1.0)) * dot(aNormal, normalize(L))); +} + diff --git a/reference/shaders/flatten/struct.rowmajor.flatten.vert b/reference/shaders/flatten/struct.rowmajor.flatten.vert new file mode 100644 index 00000000..a40aea08 --- /dev/null +++ b/reference/shaders/flatten/struct.rowmajor.flatten.vert @@ -0,0 +1,25 @@ +#version 310 es + +struct Foo +{ + mat3x4 MVP0; + mat3x4 MVP1; +}; + +uniform vec4 UBO[8]; +layout(location = 0) in vec4 v0; +layout(location = 1) in vec4 v1; +layout(location = 0) out vec3 V0; +layout(location = 1) out vec3 V1; + +void main() +{ + Foo f; + f.MVP0 = Foo(transpose(mat4x3(UBO[0].xyz, UBO[1].xyz, UBO[2].xyz, UBO[3].xyz)), transpose(mat4x3(UBO[4].xyz, UBO[5].xyz, UBO[6].xyz, UBO[7].xyz))).MVP0; + f.MVP1 = Foo(transpose(mat4x3(UBO[0].xyz, UBO[1].xyz, UBO[2].xyz, UBO[3].xyz)), transpose(mat4x3(UBO[4].xyz, UBO[5].xyz, UBO[6].xyz, UBO[7].xyz))).MVP1; + vec3 a = v0 * f.MVP0; + vec3 b = v1 * f.MVP1; + V0 = a; + V1 = 
b; +} + diff --git a/reference/shaders/flatten/swizzle.flatten.vert b/reference/shaders/flatten/swizzle.flatten.vert new file mode 100644 index 00000000..37289c0c --- /dev/null +++ b/reference/shaders/flatten/swizzle.flatten.vert @@ -0,0 +1,21 @@ +#version 310 es + +uniform vec4 UBO[8]; +out vec4 oA; +out vec4 oB; +out vec4 oC; +out vec4 oD; +out vec4 oE; +out vec4 oF; + +void main() +{ + gl_Position = vec4(0.0); + oA = UBO[0]; + oB = vec4(UBO[1].xy, UBO[1].zw); + oC = vec4(UBO[2].x, UBO[3].xyz); + oD = vec4(UBO[4].xyz, UBO[4].w); + oE = vec4(UBO[5].x, UBO[5].y, UBO[5].z, UBO[5].w); + oF = vec4(UBO[6].x, UBO[6].zw, UBO[7].x); +} + diff --git a/reference/shaders/flatten/types.flatten.frag b/reference/shaders/flatten/types.flatten.frag new file mode 100644 index 00000000..a74327d9 --- /dev/null +++ b/reference/shaders/flatten/types.flatten.frag @@ -0,0 +1,14 @@ +#version 310 es +precision mediump float; +precision highp int; + +uniform mediump ivec4 UBO1[2]; +uniform mediump uvec4 UBO2[2]; +uniform vec4 UBO0[2]; +layout(location = 0) out vec4 FragColor; + +void main() +{ + FragColor = ((((vec4(UBO1[0]) + vec4(UBO1[1])) + vec4(UBO2[0])) + vec4(UBO2[1])) + UBO0[0]) + UBO0[1]; +} + diff --git a/reference/shaders/legacy/vert/transpose.legacy.vert b/reference/shaders/legacy/vert/transpose.legacy.vert new file mode 100644 index 00000000..c73d1a11 --- /dev/null +++ b/reference/shaders/legacy/vert/transpose.legacy.vert @@ -0,0 +1,22 @@ +#version 100 + +struct Buffer +{ + mat4 MVPRowMajor; + mat4 MVPColMajor; + mat4 M; +}; + +uniform Buffer _13; + +attribute vec4 Position; + +void main() +{ + vec4 c0 = _13.M * (Position * _13.MVPRowMajor); + vec4 c1 = _13.M * (_13.MVPColMajor * Position); + vec4 c2 = _13.M * (_13.MVPRowMajor * Position); + vec4 c3 = _13.M * (Position * _13.MVPColMajor); + gl_Position = ((c0 + c1) + c2) + c3; +} + diff --git a/shaders-msl/vert/basic.vert b/shaders-msl/vert/basic.vert new file mode 100644 index 00000000..6d8bdc17 --- /dev/null +++ 
b/shaders-msl/vert/basic.vert @@ -0,0 +1,17 @@ +#version 310 es + +layout(std140) uniform UBO +{ + uniform mat4 uMVP; +}; + +layout(location = 0) in vec4 aVertex; +layout(location = 1) in vec3 aNormal; + +out vec3 vNormal; + +void main() +{ + gl_Position = uMVP * aVertex; + vNormal = aNormal; +} diff --git a/shaders/comp/read-write-only.comp b/shaders/comp/read-write-only.comp new file mode 100644 index 00000000..b224b6f1 --- /dev/null +++ b/shaders/comp/read-write-only.comp @@ -0,0 +1,26 @@ +#version 310 es +layout(local_size_x = 1) in; + +layout(binding = 0, std430) readonly buffer SSBO0 +{ + vec4 data0; + vec4 data1; +}; + +layout(binding = 1, std430) restrict buffer SSBO1 +{ + vec4 data2; + vec4 data3; +}; + +layout(binding = 2, std430) restrict writeonly buffer SSBO2 +{ + vec4 data4; + vec4 data5; +}; + +void main() +{ + data4 = data0 + data2; + data5 = data1 + data3; +} diff --git a/shaders/flatten/array.flatten.vert b/shaders/flatten/array.flatten.vert new file mode 100644 index 00000000..a1f1f100 --- /dev/null +++ b/shaders/flatten/array.flatten.vert @@ -0,0 +1,19 @@ +#version 310 es + +layout(std140) uniform UBO +{ + vec4 A4[5][4][2]; + mat4 uMVP; + vec4 A1[2]; + vec4 A2[2][3]; + float A3[3]; + vec4 Offset; +}; +in vec4 aVertex; + +void main() +{ + vec4 a4 = A4[2][3][1]; // 2 * (4 * 2) + 3 * 2 + 1 = 16 + 6 + 1 = 23. 
+ vec4 offset = A2[1][1] + A1[1] + A3[2]; + gl_Position = uMVP * aVertex + Offset + offset; +} diff --git a/shaders/flatten/basic.flatten.vert b/shaders/flatten/basic.flatten.vert new file mode 100644 index 00000000..f4e9bb39 --- /dev/null +++ b/shaders/flatten/basic.flatten.vert @@ -0,0 +1,15 @@ +#version 310 es + +layout(std140) uniform UBO +{ + mat4 uMVP; +}; +in vec4 aVertex; +in vec3 aNormal; +out vec3 vNormal; + +void main() +{ + gl_Position = uMVP * aVertex; + vNormal = aNormal; +} diff --git a/shaders/flatten/copy.flatten.vert b/shaders/flatten/copy.flatten.vert new file mode 100644 index 00000000..f2c2d3af --- /dev/null +++ b/shaders/flatten/copy.flatten.vert @@ -0,0 +1,34 @@ +#version 310 es + +struct Light +{ + vec3 Position; + float Radius; + + vec4 Color; +}; + +layout(std140) uniform UBO +{ + mat4 uMVP; + + Light lights[4]; +}; + +in vec4 aVertex; +in vec3 aNormal; +out vec4 vColor; + +void main() +{ + gl_Position = uMVP * aVertex; + + vColor = vec4(0.0); + + for (int i = 0; i < 4; ++i) + { + Light light = lights[i]; + vec3 L = aVertex.xyz - light.Position; + vColor += dot(aNormal, normalize(L)) * (clamp(1.0 - length(L) / light.Radius, 0.0, 1.0) * lights[i].Color); + } +} diff --git a/shaders/flatten/dynamic.flatten.vert b/shaders/flatten/dynamic.flatten.vert new file mode 100644 index 00000000..72862929 --- /dev/null +++ b/shaders/flatten/dynamic.flatten.vert @@ -0,0 +1,33 @@ +#version 310 es + +struct Light +{ + vec3 Position; + float Radius; + + vec4 Color; +}; + +layout(std140) uniform UBO +{ + mat4 uMVP; + + Light lights[4]; +}; + +in vec4 aVertex; +in vec3 aNormal; +out vec4 vColor; + +void main() +{ + gl_Position = uMVP * aVertex; + + vColor = vec4(0.0); + + for (int i = 0; i < 4; ++i) + { + vec3 L = aVertex.xyz - lights[i].Position; + vColor += dot(aNormal, normalize(L)) * (clamp(1.0 - length(L) / lights[i].Radius, 0.0, 1.0) * lights[i].Color); + } +} diff --git a/shaders/flatten/matrixindex.flatten.vert 
b/shaders/flatten/matrixindex.flatten.vert new file mode 100644 index 00000000..eb6ba2a2 --- /dev/null +++ b/shaders/flatten/matrixindex.flatten.vert @@ -0,0 +1,25 @@ +#version 310 es + +layout(std140) uniform UBO +{ + layout(column_major) mat4 M1C; + layout(row_major) mat4 M1R; + layout(column_major) mat2x4 M2C; + layout(row_major) mat2x4 M2R; +}; + +out vec4 oA; +out vec4 oB; +out vec4 oC; +out vec4 oD; +out vec4 oE; + +void main() +{ + gl_Position = vec4(0.0); + oA = M1C[1]; + oB = M1R[1]; + oC = M2C[1]; + oD = M2R[0]; + oE = vec4(M1C[1][2], M1R[1][2], M2C[1][2], M2R[1][2]); +} diff --git a/shaders/flatten/multiindex.flatten.vert b/shaders/flatten/multiindex.flatten.vert new file mode 100644 index 00000000..32ca9a09 --- /dev/null +++ b/shaders/flatten/multiindex.flatten.vert @@ -0,0 +1,13 @@ +#version 310 es + +layout(std140) uniform UBO +{ + vec4 Data[3][5]; +}; + +in ivec2 aIndex; + +void main() +{ + gl_Position = Data[aIndex.x][aIndex.y]; +} diff --git a/shaders/flatten/push-constant.flatten.vert b/shaders/flatten/push-constant.flatten.vert new file mode 100644 index 00000000..c7b1b42e --- /dev/null +++ b/shaders/flatten/push-constant.flatten.vert @@ -0,0 +1,17 @@ +#version 310 es + +layout(push_constant, std430) uniform PushMe +{ + mat4 MVP; + mat2 Rot; // The MatrixStride will be 8 here. + float Arr[4]; +} registers; + +layout(location = 0) in vec2 Rot; +layout(location = 1) in vec4 Pos; +layout(location = 0) out vec2 vRot; +void main() +{ + gl_Position = registers.MVP * Pos; + vRot = registers.Rot * Rot + registers.Arr[2]; // Constant access should work even if array stride is just 4 here. 
+} diff --git a/shaders/flatten/rowmajor.flatten.vert b/shaders/flatten/rowmajor.flatten.vert new file mode 100644 index 00000000..449cd409 --- /dev/null +++ b/shaders/flatten/rowmajor.flatten.vert @@ -0,0 +1,16 @@ +#version 310 es + +layout(std140) uniform UBO +{ + layout(column_major) mat4 uMVPR; + layout(row_major) mat4 uMVPC; + layout(row_major) mat2x4 uMVP; +}; + +in vec4 aVertex; + +void main() +{ + vec2 v = aVertex * uMVP; + gl_Position = uMVPR * aVertex + uMVPC * aVertex; +} diff --git a/shaders/flatten/struct.flatten.vert b/shaders/flatten/struct.flatten.vert new file mode 100644 index 00000000..b6d0b775 --- /dev/null +++ b/shaders/flatten/struct.flatten.vert @@ -0,0 +1,30 @@ +#version 310 es + +struct Light +{ + vec3 Position; + float Radius; + + vec4 Color; +}; + +layout(std140) uniform UBO +{ + mat4 uMVP; + + Light light; +}; + +in vec4 aVertex; +in vec3 aNormal; +out vec4 vColor; + +void main() +{ + gl_Position = uMVP * aVertex; + + vColor = vec4(0.0); + + vec3 L = aVertex.xyz - light.Position; + vColor += dot(aNormal, normalize(L)) * (clamp(1.0 - length(L) / light.Radius, 0.0, 1.0) * light.Color); +} diff --git a/shaders/flatten/struct.rowmajor.flatten.vert b/shaders/flatten/struct.rowmajor.flatten.vert new file mode 100644 index 00000000..231389b8 --- /dev/null +++ b/shaders/flatten/struct.rowmajor.flatten.vert @@ -0,0 +1,26 @@ +#version 310 es + +struct Foo +{ + mat3x4 MVP0; + mat3x4 MVP1; +}; + +layout(std140, binding = 0) uniform UBO +{ + layout(row_major) Foo foo; +}; + +layout(location = 0) in vec4 v0; +layout(location = 1) in vec4 v1; +layout(location = 0) out vec3 V0; +layout(location = 1) out vec3 V1; + +void main() +{ + Foo f = foo; + vec3 a = v0 * f.MVP0; + vec3 b = v1 * f.MVP1; + V0 = a; + V1 = b; +} diff --git a/shaders/flatten/swizzle.flatten.vert b/shaders/flatten/swizzle.flatten.vert new file mode 100644 index 00000000..cd2ddd84 --- /dev/null +++ b/shaders/flatten/swizzle.flatten.vert @@ -0,0 +1,42 @@ +#version 310 es + +// comments 
note the 16b alignment boundaries (see GL spec 7.6.2.2 Standard Uniform Block Layout) +layout(std140) uniform UBO +{ + // 16b boundary + vec4 A; + // 16b boundary + vec2 B0; + vec2 B1; + // 16b boundary + float C0; + // 16b boundary (vec3 is aligned to 16b) + vec3 C1; + // 16b boundary + vec3 D0; + float D1; + // 16b boundary + float E0; + float E1; + float E2; + float E3; + // 16b boundary + float F0; + vec2 F1; + // 16b boundary (vec2 before us is aligned to 8b) + float F2; +}; + +out vec4 oA, oB, oC, oD, oE, oF; + +void main() +{ + gl_Position = vec4(0.0); + + oA = A; + oB = vec4(B0, B1); + oC = vec4(C0, C1); + oD = vec4(D0, D1); + oE = vec4(E0, E1, E2, E3); + oF = vec4(F0, F1, F2); +} diff --git a/shaders/flatten/types.flatten.frag b/shaders/flatten/types.flatten.frag new file mode 100644 index 00000000..faab5b7e --- /dev/null +++ b/shaders/flatten/types.flatten.frag @@ -0,0 +1,27 @@ +#version 310 es +precision mediump float; + +layout(std140, binding = 0) uniform UBO0 +{ + vec4 a; + vec4 b; +}; + +layout(std140, binding = 0) uniform UBO1 +{ + ivec4 c; + ivec4 d; +}; + +layout(std140, binding = 0) uniform UBO2 +{ + uvec4 e; + uvec4 f; +}; + +layout(location = 0) out vec4 FragColor; + +void main() +{ + FragColor = vec4(c) + vec4(d) + vec4(e) + vec4(f) + a + b; +} diff --git a/shaders/legacy/vert/transpose.legacy.vert b/shaders/legacy/vert/transpose.legacy.vert new file mode 100644 index 00000000..84f61826 --- /dev/null +++ b/shaders/legacy/vert/transpose.legacy.vert @@ -0,0 +1,20 @@ +#version 310 es + +uniform Buffer +{ + layout(row_major) mat4 MVPRowMajor; + layout(column_major) mat4 MVPColMajor; + mat4 M; +}; + +layout(location = 0) in vec4 Position; + +void main() +{ + vec4 c0 = M * (MVPRowMajor * Position); + vec4 c1 = M * (MVPColMajor * Position); + vec4 c2 = M * (Position * MVPRowMajor); + vec4 c3 = M * (Position * MVPColMajor); + gl_Position = c0 + c1 + c2 + c3; +} + diff --git a/spirv_common.hpp b/spirv_common.hpp index 58ced976..16285088 100644 --- 
a/spirv_common.hpp +++ b/spirv_common.hpp @@ -105,7 +105,7 @@ inline std::string convert_to_string(T &&t) #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4996) +#pragma warning(disable : 4996) #endif inline std::string convert_to_string(float t) @@ -265,6 +265,10 @@ struct SPIRType : IVariant // Since we cannot rely on OpName to be equal, we need to figure out aliases. uint32_t type_alias = 0; + // Denotes the type which this type is based on. + // Allows the backend to traverse how a complex type is built up during access chains. + uint32_t parent_type = 0; + // Used in backends to avoid emitting members with conflicting names. std::unordered_set member_name_cache; }; @@ -351,6 +355,10 @@ struct SPIRExpression : IVariant // If this expression has been used while invalidated. bool used_while_invalidated = false; + // Before use, this expression must be transposed. + // This is needed for targets which don't support row_major layouts. + bool need_transpose = false; + // A list of expressions which this expression depends on. 
std::vector expression_dependencies; }; @@ -900,6 +908,7 @@ struct Meta uint32_t binding = 0; uint32_t offset = 0; uint32_t array_stride = 0; + uint32_t matrix_stride = 0; uint32_t input_attachment = 0; uint32_t spec_id = 0; bool builtin = false; diff --git a/spirv_cross.cpp b/spirv_cross.cpp index ed1cc46a..c3e630bf 100644 --- a/spirv_cross.cpp +++ b/spirv_cross.cpp @@ -842,6 +842,10 @@ void Compiler::set_member_decoration(uint32_t id, uint32_t index, Decoration dec dec.spec_id = argument; break; + case DecorationMatrixStride: + dec.matrix_stride = argument; + break; + default: break; } @@ -961,6 +965,10 @@ void Compiler::set_decoration(uint32_t id, Decoration decoration, uint32_t argum dec.array_stride = argument; break; + case DecorationMatrixStride: + dec.matrix_stride = argument; + break; + case DecorationBinding: dec.binding = argument; break; @@ -1020,6 +1028,10 @@ uint32_t Compiler::get_decoration(uint32_t id, Decoration decoration) const return dec.input_attachment; case DecorationSpecId: return dec.spec_id; + case DecorationArrayStride: + return dec.array_stride; + case DecorationMatrixStride: + return dec.matrix_stride; default: return 1; } @@ -1271,6 +1283,7 @@ void Compiler::parse(const Instruction &instruction) vecbase = base; vecbase.vecsize = vecsize; vecbase.self = id; + vecbase.parent_type = ops[1]; break; } @@ -1285,6 +1298,7 @@ void Compiler::parse(const Instruction &instruction) matrixbase = base; matrixbase.columns = colcount; matrixbase.self = id; + matrixbase.parent_type = ops[1]; break; } @@ -1302,6 +1316,7 @@ void Compiler::parse(const Instruction &instruction) arraybase.array_size_literal.push_back(literal); arraybase.array.push_back(literal ? c->scalar() : ops[2]); + arraybase.parent_type = ops[1]; // Do NOT set arraybase.self! 
break; } @@ -1316,6 +1331,7 @@ void Compiler::parse(const Instruction &instruction) arraybase = base; arraybase.array.push_back(0); arraybase.array_size_literal.push_back(true); + arraybase.parent_type = ops[1]; // Do NOT set arraybase.self! break; } @@ -1371,6 +1387,8 @@ void Compiler::parse(const Instruction &instruction) if (ptrbase.storage == StorageClassAtomicCounter) ptrbase.basetype = SPIRType::AtomicCounter; + ptrbase.parent_type = ops[2]; + // Do NOT set ptrbase.self! break; } @@ -2045,6 +2063,17 @@ uint32_t Compiler::type_struct_member_array_stride(const SPIRType &type, uint32_ SPIRV_CROSS_THROW("Struct member does not have ArrayStride set."); } +uint32_t Compiler::type_struct_member_matrix_stride(const SPIRType &type, uint32_t index) const +{ + // Decoration must be set in valid SPIR-V, otherwise throw. + // MatrixStride is part of OpMemberDecorate. + auto &dec = meta[type.self].members[index]; + if (dec.decoration_flags & (1ull << DecorationMatrixStride)) + return dec.matrix_stride; + else + SPIRV_CROSS_THROW("Struct member does not have MatrixStride set."); +} + size_t Compiler::get_declared_struct_size(const SPIRType &type) const { uint32_t last = uint32_t(type.member_types.size() - 1); @@ -2058,63 +2087,53 @@ size_t Compiler::get_declared_struct_member_size(const SPIRType &struct_type, ui auto flags = get_member_decoration_mask(struct_type.self, index); auto &type = get(struct_type.member_types[index]); - if (type.basetype != SPIRType::Struct) + switch (type.basetype) { - switch (type.basetype) - { - case SPIRType::Unknown: - case SPIRType::Void: - case SPIRType::Boolean: // Bools are purely logical, and cannot be used for externally visible types. 
- case SPIRType::AtomicCounter: - case SPIRType::Image: - case SPIRType::SampledImage: - case SPIRType::Sampler: - SPIRV_CROSS_THROW("Querying size for object with opaque size.\n"); + case SPIRType::Unknown: + case SPIRType::Void: + case SPIRType::Boolean: // Bools are purely logical, and cannot be used for externally visible types. + case SPIRType::AtomicCounter: + case SPIRType::Image: + case SPIRType::SampledImage: + case SPIRType::Sampler: + SPIRV_CROSS_THROW("Querying size for object with opaque size."); - default: - break; - } + default: + break; + } - size_t component_size = type.width / 8; - unsigned vecsize = type.vecsize; - unsigned columns = type.columns; - - if (type.array.empty()) - { - // Vectors. - if (columns == 1) - return vecsize * component_size; - else - { - // Per SPIR-V spec, matrices must be tightly packed and aligned up for vec3 accesses. - if ((flags & (1ull << DecorationRowMajor)) && columns == 3) - columns = 4; - else if ((flags & (1ull << DecorationColMajor)) && vecsize == 3) - vecsize = 4; - - return vecsize * columns * component_size; - } - } - else - { - // For arrays, we can use ArrayStride to get an easy check. - return type_struct_member_array_stride(struct_type, index) * type.array.back(); - } + if (!type.array.empty()) + { + // For arrays, we can use ArrayStride to get an easy check. + return type_struct_member_array_stride(struct_type, index) * type.array.back(); + } + else if (type.basetype == SPIRType::Struct) + { + return get_declared_struct_size(type); } else { - // Recurse. - uint32_t last = uint32_t(struct_type.member_types.size() - 1); - uint32_t offset = type_struct_member_offset(struct_type, last); - size_t size; + unsigned vecsize = type.vecsize; + unsigned columns = type.columns; - // If we have an array of structs inside our struct, handle that with array strides instead. 
- auto &last_type = get(struct_type.member_types.back()); - if (last_type.array.empty()) - size = get_declared_struct_size(last_type); + // Vectors. + if (columns == 1) + { + size_t component_size = type.width / 8; + return vecsize * component_size; + } else - size = type_struct_member_array_stride(struct_type, last) * last_type.array.back(); - return offset + size; + { + uint32_t matrix_stride = type_struct_member_matrix_stride(struct_type, index); + + // Per SPIR-V spec, matrices must be tightly packed and aligned up for vec3 accesses. + if (flags & (1ull << DecorationRowMajor)) + return matrix_stride * vecsize; + else if (flags & (1ull << DecorationColMajor)) + return matrix_stride * columns; + else + SPIRV_CROSS_THROW("Either row-major or column-major must be declared for matrices."); + } } } @@ -3024,3 +3043,48 @@ void Compiler::analyze_variable_scope(SPIRFunction &entry) this->get(loop_variable.first).loop_variable = true; } } + +uint64_t Compiler::get_buffer_block_flags(const SPIRVariable &var) +{ + auto &type = get(var.basetype); + assert(type.basetype == SPIRType::Struct); + + // Some flags like non-writable, non-readable are actually found + // as member decorations. If all members have a decoration set, propagate + // the decoration up as a regular variable decoration. 
+ uint64_t base_flags = meta[var.self].decoration.decoration_flags; + + if (type.member_types.empty()) + return base_flags; + + uint64_t all_members_flag_mask = ~(0ull); + for (uint32_t i = 0; i < uint32_t(type.member_types.size()); i++) + all_members_flag_mask &= get_member_decoration_mask(type.self, i); + + return base_flags | all_members_flag_mask; +} + +bool Compiler::get_common_basic_type(const SPIRType &type, SPIRType::BaseType &base_type) +{ + if (type.basetype == SPIRType::Struct) + { + base_type = SPIRType::Unknown; + for (auto &member_type : type.member_types) + { + SPIRType::BaseType member_base; + if (!get_common_basic_type(get(member_type), member_base)) + return false; + + if (base_type == SPIRType::Unknown) + base_type = member_base; + else if (base_type != member_base) + return false; + } + return true; + } + else + { + base_type = type.basetype; + return true; + } +} diff --git a/spirv_cross.hpp b/spirv_cross.hpp index c3304290..fab5489f 100644 --- a/spirv_cross.hpp +++ b/spirv_cross.hpp @@ -201,11 +201,7 @@ public: // Returns the effective size of a buffer block struct member. virtual size_t get_declared_struct_member_size(const SPIRType &struct_type, uint32_t index) const; - // Legacy GLSL compatibility method. - // Takes a variable with a block interface and flattens it into a T array[N]; array instead. - // For this to work, all types in the block must not themselves be composites - // (except vectors and matrices), and all types must be the same. - // The name of the uniform will be the same as the interface block name. + // Legacy GLSL compatibility method. 
Deprecated in favor of CompilerGLSL::flatten_buffer_block void flatten_interface_block(uint32_t id); // Returns a set of all global variables which are statically accessed @@ -463,6 +459,7 @@ protected: uint32_t type_struct_member_offset(const SPIRType &type, uint32_t index) const; uint32_t type_struct_member_array_stride(const SPIRType &type, uint32_t index) const; + uint32_t type_struct_member_matrix_stride(const SPIRType &type, uint32_t index) const; bool block_is_loop_candidate(const SPIRBlock &block, SPIRBlock::Method method) const; @@ -580,6 +577,9 @@ protected: ShaderResources get_shader_resources(const std::unordered_set *active_variables) const; VariableTypeRemapCallback variable_remap_callback; + + uint64_t get_buffer_block_flags(const SPIRVariable &var); + bool get_common_basic_type(const SPIRType &type, SPIRType::BaseType &base_type); }; } diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp index 95c80896..f35a1969 100644 --- a/spirv_glsl.cpp +++ b/spirv_glsl.cpp @@ -18,6 +18,7 @@ #include "GLSL.std.450.h" #include #include +#include using namespace spv; using namespace spirv_cross; @@ -129,6 +130,19 @@ static uint32_t pls_format_to_components(PlsFormat format) } } +static const char *vector_swizzle(int vecsize, int index) +{ + static const char *swizzle[4][4] = { + { ".x", ".y", ".z", ".w" }, { ".xy", ".yz", ".zw" }, { ".xyz", ".yzw" }, { "" } + }; + + assert(vecsize >= 1 && vecsize <= 4); + assert(index >= 0 && index < 4); + assert(swizzle[vecsize - 1][index]); + + return swizzle[vecsize - 1][index]; +} + void CompilerGLSL::reset() { // We do some speculative optimizations which should pretty much always work out, @@ -961,7 +975,9 @@ string CompilerGLSL::layout_for_variable(const SPIRVariable &var) void CompilerGLSL::emit_push_constant_block(const SPIRVariable &var) { - if (options.vulkan_semantics) + if (flattened_buffer_blocks.count(var.self)) + emit_buffer_block_flattened(var); + else if (options.vulkan_semantics) emit_push_constant_block_vulkan(var); 
else emit_push_constant_block_glsl(var); @@ -1001,10 +1017,42 @@ void CompilerGLSL::emit_push_constant_block_glsl(const SPIRVariable &var) } void CompilerGLSL::emit_buffer_block(const SPIRVariable &var) +{ + if (flattened_buffer_blocks.count(var.self)) + emit_buffer_block_flattened(var); + else if (is_legacy()) + emit_buffer_block_legacy(var); + else + emit_buffer_block_native(var); +} + +void CompilerGLSL::emit_buffer_block_legacy(const SPIRVariable &var) { auto &type = get(var.basetype); bool ssbo = (meta[type.self].decoration.decoration_flags & (1ull << DecorationBufferBlock)) != 0; - bool is_restrict = (meta[var.self].decoration.decoration_flags & (1ull << DecorationRestrict)) != 0; + if (ssbo) + SPIRV_CROSS_THROW("SSBOs not supported in legacy targets."); + + // We're emitting the push constant block as a regular struct, so disable the block qualifier temporarily. + // Otherwise, we will end up emitting layout() qualifiers on naked structs which is not allowed. + auto &block_flags = meta[type.self].decoration.decoration_flags; + uint64_t block_flag = block_flags & (1ull << DecorationBlock); + block_flags &= ~block_flag; + emit_struct(type); + block_flags |= block_flag; + emit_uniform(var); + statement(""); +} + +void CompilerGLSL::emit_buffer_block_native(const SPIRVariable &var) +{ + auto &type = get(var.basetype); + + uint64_t flags = get_buffer_block_flags(var); + bool ssbo = (meta[type.self].decoration.decoration_flags & (1ull << DecorationBufferBlock)) != 0; + bool is_restrict = ssbo && (flags & (1ull << DecorationRestrict)) != 0; + bool is_writeonly = ssbo && (flags & (1ull << DecorationNonReadable)) != 0; + bool is_readonly = ssbo && (flags & (1ull << DecorationNonWritable)) != 0; add_resource_name(var.self); @@ -1018,7 +1066,9 @@ void CompilerGLSL::emit_buffer_block(const SPIRVariable &var) else resource_names.insert(buffer_name); - statement(layout_for_variable(var), is_restrict ? "restrict " : "", ssbo ? 
"buffer " : "uniform ", buffer_name); + statement(layout_for_variable(var), is_restrict ? "restrict " : "", is_writeonly ? "writeonly " : "", + is_readonly ? "readonly " : "", ssbo ? "buffer " : "uniform ", buffer_name); + begin_scope(); type.member_name_cache.clear(); @@ -1037,6 +1087,31 @@ void CompilerGLSL::emit_buffer_block(const SPIRVariable &var) statement(""); } +void CompilerGLSL::emit_buffer_block_flattened(const SPIRVariable &var) +{ + auto &type = get(var.basetype); + + // Block names should never alias. + auto buffer_name = to_name(type.self, false); + size_t buffer_size = (get_declared_struct_size(type) + 15) / 16; + + SPIRType::BaseType basic_type; + if (get_common_basic_type(type, basic_type)) + { + SPIRType tmp; + tmp.basetype = basic_type; + tmp.vecsize = 4; + if (basic_type != SPIRType::Float && basic_type != SPIRType::Int && basic_type != SPIRType::UInt) + SPIRV_CROSS_THROW("Basic types in a flattened UBO must be float, int or uint."); + + auto flags = get_buffer_block_flags(var); + statement("uniform ", flags_to_precision_qualifiers_glsl(tmp, flags), type_to_glsl(tmp), " ", buffer_name, "[", + buffer_size, "];"); + } + else + SPIRV_CROSS_THROW("All basic types in a flattened block must be the same."); +} + void CompilerGLSL::emit_interface_block(const SPIRVariable &var) { auto &execution = get_entry_point(); @@ -1055,6 +1130,9 @@ void CompilerGLSL::emit_interface_block(const SPIRVariable &var) if (block) { + if (is_legacy()) + SPIRV_CROSS_THROW("IO blocks are not supported in legacy targets."); + add_resource_name(var.self); // Block names should never alias. 
@@ -1504,7 +1582,7 @@ void CompilerGLSL::strip_enclosed_expression(string &expr) return; } } - expr.pop_back(); + expr.erase(expr.size() - 1, 1); expr.erase(begin(expr)); } @@ -1574,6 +1652,8 @@ string CompilerGLSL::to_expression(uint32_t id) auto &e = get(id); if (e.base_expression) return to_enclosed_expression(e.base_expression) + e.expression; + else if (e.need_transpose) + return convert_row_major_matrix(e.expression); else return e.expression; } @@ -2549,8 +2629,8 @@ void CompilerGLSL::emit_texture_op(const Instruction &i) string expr; bool forward = false; - expr += to_function_name(img, imgtype, !!fetch, !!gather, !!proj, !!coffsets, (!!coffset || !!offset), (!!grad_x || !!grad_y), !!lod, - !!dref); + expr += to_function_name(img, imgtype, !!fetch, !!gather, !!proj, !!coffsets, (!!coffset || !!offset), + (!!grad_x || !!grad_y), !!lod, !!dref); expr += "("; expr += to_function_args(img, imgtype, fetch, gather, proj, coord, coord_components, dref, grad_x, grad_y, lod, coffset, offset, bias, comp, sample, &forward); @@ -3097,7 +3177,7 @@ const char *CompilerGLSL::index_to_swizzle(uint32_t index) } string CompilerGLSL::access_chain(uint32_t base, const uint32_t *indices, uint32_t count, bool index_is_literal, - bool chain_only) + bool chain_only, bool *need_transpose) { string expr; if (!chain_only) @@ -3218,9 +3298,302 @@ string CompilerGLSL::access_chain(uint32_t base, const uint32_t *indices, uint32 SPIRV_CROSS_THROW("Cannot subdivide a scalar value!"); } + if (need_transpose) + *need_transpose = row_major_matrix_needs_conversion; return expr; } +string CompilerGLSL::access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type, + bool *out_need_transpose) +{ + if (flattened_buffer_blocks.count(base)) + { + uint32_t matrix_stride; + bool need_transpose; + flattened_access_chain_offset(base, indices, count, 0, &need_transpose, &matrix_stride); + + if (out_need_transpose) + *out_need_transpose = target_type.columns > 1 && 
need_transpose; + + return flattened_access_chain(base, indices, count, target_type, 0, matrix_stride, need_transpose); + } + else + { + return access_chain(base, indices, count, false, false, out_need_transpose); + } +} + +std::string CompilerGLSL::flattened_access_chain(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset, uint32_t matrix_stride, + bool need_transpose) +{ + if (!target_type.array.empty()) + SPIRV_CROSS_THROW("Access chains that result in an array can not be flattened"); + else if (target_type.basetype == SPIRType::Struct) + return flattened_access_chain_struct(base, indices, count, target_type, offset); + else if (target_type.columns > 1) + return flattened_access_chain_matrix(base, indices, count, target_type, offset, matrix_stride, need_transpose); + else + return flattened_access_chain_vector(base, indices, count, target_type, offset, matrix_stride, need_transpose); +} + +std::string CompilerGLSL::flattened_access_chain_struct(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset) +{ + std::string expr; + + expr += type_to_glsl_constructor(target_type); + expr += "("; + + for (size_t i = 0; i < target_type.member_types.size(); ++i) + { + if (i != 0) + expr += ", "; + + const SPIRType &member_type = get(target_type.member_types[i]); + uint32_t member_offset = type_struct_member_offset(target_type, i); + + // The access chain terminates at the struct, so we need to find matrix strides and row-major information + // ahead of time. 
+ bool need_transpose = false; + uint32_t matrix_stride = 0; + if (member_type.columns > 1) + { + need_transpose = (combined_decoration_for_member(target_type, i) & (1ull << DecorationRowMajor)) != 0; + matrix_stride = type_struct_member_matrix_stride(target_type, i); + } + + auto tmp = flattened_access_chain(base, indices, count, member_type, offset + member_offset, matrix_stride, + need_transpose); + + // Cannot forward transpositions, so resolve them here. + if (need_transpose) + expr += convert_row_major_matrix(tmp); + else + expr += tmp; + } + + expr += ")"; + + return expr; +} + +std::string CompilerGLSL::flattened_access_chain_matrix(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset, + uint32_t matrix_stride, bool need_transpose) +{ + assert(matrix_stride); + SPIRType tmp_type = target_type; + if (need_transpose) + swap(tmp_type.vecsize, tmp_type.columns); + + std::string expr; + + expr += type_to_glsl_constructor(tmp_type); + expr += "("; + + for (uint32_t i = 0; i < tmp_type.columns; i++) + { + if (i != 0) + expr += ", "; + + expr += flattened_access_chain_vector(base, indices, count, tmp_type, offset + i * matrix_stride, matrix_stride, + /* need_transpose= */ false); + } + + expr += ")"; + + return expr; +} + +std::string CompilerGLSL::flattened_access_chain_vector(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset, + uint32_t matrix_stride, bool need_transpose) +{ + auto result = flattened_access_chain_offset(base, indices, count, offset); + + auto buffer_name = to_name(expression_type(base).self); + + if (need_transpose) + { + std::string expr; + + if (target_type.vecsize > 1) + { + expr += type_to_glsl_constructor(target_type); + expr += "("; + } + + for (uint32_t i = 0; i < target_type.vecsize; ++i) + { + if (i != 0) + expr += ", "; + + uint32_t component_offset = result.second + i * matrix_stride; + + assert(component_offset % (target_type.width / 
8) == 0); + uint32_t index = component_offset / (target_type.width / 8); + + expr += buffer_name; + expr += "["; + expr += result.first; // this is a series of N1 * k1 + N2 * k2 + ... that is either empty or ends with a + + expr += convert_to_string(index / 4); + expr += "]"; + + expr += vector_swizzle(1, index % 4); + } + + if (target_type.vecsize > 1) + { + expr += ")"; + } + + return expr; + } + else + { + assert(result.second % (target_type.width / 8) == 0); + uint32_t index = result.second / (target_type.width / 8); + + std::string expr; + + expr += buffer_name; + expr += "["; + expr += result.first; // this is a series of N1 * k1 + N2 * k2 + ... that is either empty or ends with a + + expr += convert_to_string(index / 4); + expr += "]"; + + expr += vector_swizzle(target_type.vecsize, index % 4); + + return expr; + } +} + +std::pair CompilerGLSL::flattened_access_chain_offset(uint32_t base, const uint32_t *indices, + uint32_t count, uint32_t offset, + bool *need_transpose, + uint32_t *out_matrix_stride) +{ + const auto *type = &expression_type(base); + + // This holds the type of the current pointer which we are traversing through. + // We always start out from a struct type which is the block. + // This is primarily used to reflect the array strides and matrix strides later. + // For the first access chain index, type_id won't be needed, so just keep it as 0, it will be set + // accordingly as members of structs are accessed. + assert(type->basetype == SPIRType::Struct); + uint32_t type_id = 0; + + uint32_t matrix_stride = 0; + + std::string expr; + bool row_major_matrix_needs_conversion = false; + + for (uint32_t i = 0; i < count; i++) + { + uint32_t index = indices[i]; + + // Arrays + if (!type->array.empty()) + { + // Here, the type_id will be a type ID for the array type itself. 
+ uint32_t array_stride = get_decoration(type_id, DecorationArrayStride); + if (!array_stride) + SPIRV_CROSS_THROW("SPIR-V does not define ArrayStride for buffer block."); + + auto *constant = maybe_get(index); + if (constant) + { + // Constant array access. + offset += constant->scalar() * array_stride; + } + else + { + // Dynamic array access. + // FIXME: This will need to change if we support other flattening types than 32-bit. + const uint32_t word_stride = 16; + if (array_stride % word_stride) + { + SPIRV_CROSS_THROW( + "Array stride for dynamic indexing must be divisible by the size of a 4-component vector. " + "Likely culprit here is a float or vec2 array inside a push constant block which is std430. " + "This cannot be flattened. Try using std140 layout instead."); + } + + expr += to_expression(index); + expr += " * "; + expr += convert_to_string(array_stride / word_stride); + expr += " + "; + } + + uint32_t parent_type = type->parent_type; + type = &get(parent_type); + type_id = parent_type; + + // Type ID now refers to the array type with one less dimension. + } + // For structs, the index refers to a constant, which indexes into the members. + // We also check if this member is a builtin, since we then replace the entire expression with the builtin one. 
+ else if (type->basetype == SPIRType::Struct) + { + index = get(index).scalar(); + + if (index >= type->member_types.size()) + SPIRV_CROSS_THROW("Member index is out of bounds!"); + + offset += type_struct_member_offset(*type, index); + type_id = type->member_types[index]; + + auto &struct_type = *type; + type = &get(type->member_types[index]); + + if (type->columns > 1) + { + matrix_stride = type_struct_member_matrix_stride(struct_type, index); + row_major_matrix_needs_conversion = + (combined_decoration_for_member(struct_type, index) & (1ull << DecorationRowMajor)) != 0; + } + else + row_major_matrix_needs_conversion = false; + } + // Matrix -> Vector + else if (type->columns > 1) + { + if (ids[index].get_type() != TypeConstant) + SPIRV_CROSS_THROW("Cannot flatten dynamic matrix indexing!"); + + index = get(index).scalar(); + offset += index * (row_major_matrix_needs_conversion ? type->width / 8 : matrix_stride); + + uint32_t parent_type = type->parent_type; + type = &get(type->parent_type); + type_id = parent_type; + } + // Vector -> Scalar + else if (type->vecsize > 1) + { + if (ids[index].get_type() != TypeConstant) + SPIRV_CROSS_THROW("Cannot flatten dynamic vector indexing!"); + + index = get(index).scalar(); + offset += index * (row_major_matrix_needs_conversion ? matrix_stride : type->width / 8); + + uint32_t parent_type = type->parent_type; + type = &get(type->parent_type); + type_id = parent_type; + } + else + SPIRV_CROSS_THROW("Cannot subdivide a scalar value!"); + } + + if (need_transpose) + *need_transpose = row_major_matrix_needs_conversion; + if (out_matrix_stride) + *out_matrix_stride = matrix_stride; + + return std::make_pair(expr, offset); +} + bool CompilerGLSL::should_forward(uint32_t id) { // Immutable expression can always be forwarded. @@ -3554,13 +3927,28 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // If an expression is mutable and forwardable, we speculate that it is immutable. 
bool forward = should_forward(ptr) && forced_temporaries.find(id) == end(forced_temporaries); - // If loading a non-native row-major matrix, convert it to column-major + // If loading a non-native row-major matrix, mark the expression as need_transpose. + bool need_transpose = false; + bool old_need_transpose = false; + + auto *ptr_expression = maybe_get(ptr); + if (ptr_expression && ptr_expression->need_transpose) + { + old_need_transpose = true; + ptr_expression->need_transpose = false; + need_transpose = true; + } + else if (is_non_native_row_major_matrix(ptr)) + need_transpose = true; + auto expr = to_expression(ptr); - if (is_non_native_row_major_matrix(ptr)) - expr = convert_row_major_matrix(expr); + + if (ptr_expression) + ptr_expression->need_transpose = old_need_transpose; // Suppress usage tracking since using same expression multiple times does not imply any extra work. - emit_op(result_type, id, expr, forward, true); + auto &e = emit_op(result_type, id, expr, forward, true); + e.need_transpose = need_transpose; register_read(id, ptr, forward); break; } @@ -3574,9 +3962,11 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // If the base is immutable, the access chain pointer must also be. // If an expression is mutable and forwardable, we speculate that it is immutable. - auto e = access_chain(ops[2], &ops[3], length - 3, false); + bool need_transpose; + auto e = access_chain(ops[2], &ops[3], length - 3, get(ops[0]), &need_transpose); auto &expr = set(ops[1], move(e), ops[0], should_forward(ops[2])); expr.loaded_from = ops[2]; + expr.need_transpose = need_transpose; break; } @@ -4012,11 +4402,25 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } - case OpFMul: + case OpVectorTimesMatrix: case OpMatrixTimesVector: + { + // If the matrix needs transpose, just flip the multiply order. + auto *e = maybe_get(ops[opcode == OpMatrixTimesVector ? 
2 : 3]); + if (e && e->need_transpose) + { + e->need_transpose = false; + emit_binary_op(ops[0], ops[1], ops[3], ops[2], "*"); + e->need_transpose = true; + } + else + BOP(*); + break; + } + + case OpFMul: case OpMatrixTimesScalar: case OpVectorTimesScalar: - case OpVectorTimesMatrix: case OpMatrixTimesMatrix: BOP(*); break; @@ -4855,7 +5259,8 @@ void CompilerGLSL::add_member_name(SPIRType &type, uint32_t index) bool CompilerGLSL::is_non_native_row_major_matrix(uint32_t id) { // Natively supported row-major matrices do not need to be converted. - if (backend.native_row_major_matrix) + // Legacy targets do not support row major. + if (backend.native_row_major_matrix && !is_legacy()) return false; // Non-matrix or column-major matrix types do not need to be converted. @@ -4876,7 +5281,7 @@ bool CompilerGLSL::is_non_native_row_major_matrix(uint32_t id) bool CompilerGLSL::member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index) { // Natively supported row-major matrices do not need to be converted. - if (backend.native_row_major_matrix) + if (backend.native_row_major_matrix && !is_legacy()) return false; // Non-matrix or column-major matrix types do not need to be converted. 
@@ -5331,6 +5736,25 @@ void CompilerGLSL::require_extension(const string &ext) } } +void CompilerGLSL::flatten_buffer_block(uint32_t id) +{ + auto &var = get(id); + auto &type = get(var.basetype); + auto name = to_name(type.self, false); + auto flags = meta.at(type.self).decoration.decoration_flags; + + if (!type.array.empty()) + SPIRV_CROSS_THROW(name + " is an array of UBOs."); + if (type.basetype != SPIRType::Struct) + SPIRV_CROSS_THROW(name + " is not a struct."); + if ((flags & (1ull << DecorationBlock)) == 0) + SPIRV_CROSS_THROW(name + " is not a block."); + if (type.member_types.empty()) + SPIRV_CROSS_THROW(name + " is an empty struct."); + + flattened_buffer_blocks.insert(id); +} + bool CompilerGLSL::check_atomic_image(uint32_t id) { auto &type = expression_type(id); diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp index b597f75f..6e08c099 100644 --- a/spirv_glsl.hpp +++ b/spirv_glsl.hpp @@ -138,6 +138,13 @@ public: // require_extension("GL_KHR_my_extension"); void require_extension(const std::string &ext); + // Legacy GLSL compatibility method. + // Takes a uniform or push constant variable and flattens it into a (i|u)vec4 array[N]; array instead. + // For this to work, all types in the block must be the same basic type, e.g. mixing vec2 and vec4 is fine, but + // mixing int and float is not. + // The name of the uniform array will be the same as the interface block name. 
+ void flatten_buffer_block(uint32_t id); + protected: void reset(); void emit_function(SPIRFunction &func, uint64_t return_flags); @@ -264,6 +271,9 @@ protected: void emit_struct(SPIRType &type); void emit_resources(); void emit_buffer_block(const SPIRVariable &type); + void emit_buffer_block_native(const SPIRVariable &var); + void emit_buffer_block_legacy(const SPIRVariable &var); + void emit_buffer_block_flattened(const SPIRVariable &type); void emit_push_constant_block(const SPIRVariable &var); void emit_push_constant_block_vulkan(const SPIRVariable &var); void emit_push_constant_block_glsl(const SPIRVariable &var); @@ -305,7 +315,25 @@ protected: SPIRExpression &emit_op(uint32_t result_type, uint32_t result_id, const std::string &rhs, bool forward_rhs, bool suppress_usage_tracking = false); std::string access_chain(uint32_t base, const uint32_t *indices, uint32_t count, bool index_is_literal, - bool chain_only = false); + bool chain_only = false, bool *need_transpose = nullptr); + std::string access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type, + bool *need_transpose = nullptr); + + std::string flattened_access_chain(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset, uint32_t matrix_stride, + bool need_transpose); + std::string flattened_access_chain_struct(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset); + std::string flattened_access_chain_matrix(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset, uint32_t matrix_stride, + bool need_transpose); + std::string flattened_access_chain_vector(uint32_t base, const uint32_t *indices, uint32_t count, + const SPIRType &target_type, uint32_t offset, uint32_t matrix_stride, + bool need_transpose); + std::pair flattened_access_chain_offset(uint32_t base, const uint32_t *indices, + uint32_t count, uint32_t offset, + 
bool *need_transpose = nullptr, + uint32_t *matrix_stride = nullptr); const char *index_to_swizzle(uint32_t index); std::string remap_swizzle(uint32_t result_type, uint32_t input_components, uint32_t expr); @@ -354,6 +382,8 @@ protected: std::unordered_set emitted_functions; + std::unordered_set flattened_buffer_blocks; + // Usage tracking. If a temporary is used more than once, use the temporary instead to // avoid AST explosion when SPIRV is generated with pure SSA and doesn't write stuff to variables. std::unordered_map expression_usage_counts; diff --git a/spirv_msl.cpp b/spirv_msl.cpp index 5b6c0d3a..58c95b4d 100644 --- a/spirv_msl.cpp +++ b/spirv_msl.cpp @@ -68,9 +68,9 @@ string CompilerMSL::compile(MSLConfiguration &msl_cfg, vector *p_ for (auto &rb : *p_res_bindings) resource_bindings.push_back(&rb); - // Establish the need to output any custom functions + // Preprocess OpCodes to extract the need to output additional header content set_enabled_interface_variables(get_active_interface_variables()); - register_custom_functions(); + preprocess_op_codes(); // Create structs to hold input, output and uniform variables qual_pos_var_name = ""; @@ -110,7 +110,6 @@ string CompilerMSL::compile(MSLConfiguration &msl_cfg, vector *p_ emit_header(); emit_resources(); emit_custom_functions(); - emit_function_declarations(); emit_function(get(entry_point), 0); pass_count++; @@ -126,11 +125,15 @@ string CompilerMSL::compile() } // Register the need to output any custom functions. 
-void CompilerMSL::register_custom_functions() +void CompilerMSL::preprocess_op_codes() { custom_function_ops.clear(); - CustomFunctionHandler handler(*this, custom_function_ops); - traverse_all_reachable_opcodes(get(entry_point), handler); + + OpCodePreprocessor preproc(*this); + traverse_all_reachable_opcodes(get(entry_point), preproc); + + if (preproc.suppress_missing_prototypes) + add_header_line("#pragma clang diagnostic ignored \"-Wmissing-prototypes\""); } // Move the Private global variables to the entry function. @@ -452,8 +455,13 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage) // Emits the file header info void CompilerMSL::emit_header() { - for (auto &header : header_lines) - statement(header); + if (!header_lines.empty()) + { + for (auto &header : header_lines) + statement(header); + + statement(""); + } statement("#include "); statement("#include "); @@ -472,8 +480,6 @@ void CompilerMSL::emit_custom_functions() case OpFMod: statement("// Support GLSL mod(), which is slightly different than Metal fmod()"); statement("template"); - statement("Tx mod(Tx x, Ty y);"); - statement("template"); statement("Tx mod(Tx x, Ty y)"); begin_scope(); statement("return x - y * floor(x / y);"); @@ -680,29 +686,9 @@ void CompilerMSL::emit_interface_block(uint32_t ib_var_id) } } -// Output a declaration statement for each function. -void CompilerMSL::emit_function_declarations() -{ - for (auto &id : ids) - if (id.get_type() == TypeFunction) - { - auto &func = id.get(); - if (func.self != entry_point) { - emit_function_prototype(func, true); - } - } - - statement(""); -} - -void CompilerMSL::emit_function_prototype(SPIRFunction &func, uint64_t) -{ - emit_function_prototype(func, false); -} - // Emits the declaration signature of the specified function. // If this is the entry point function, Metal-specific return value and function arguments are added. 
-void CompilerMSL::emit_function_prototype(SPIRFunction &func, bool is_decl) +void CompilerMSL::emit_function_prototype(SPIRFunction &func, uint64_t) { local_variable_names = resource_names; string decl; @@ -763,7 +749,7 @@ void CompilerMSL::emit_function_prototype(SPIRFunction &func, bool is_decl) } decl += ")"; - statement(decl, (is_decl ? ";" : "")); + statement(decl); } // Returns the texture sampling function string for the specified image and sampling characteristics. @@ -1304,15 +1290,25 @@ string CompilerMSL::entry_point_args(bool append_comma) { switch (type.basetype) { - case SPIRType::Struct: { - auto &m = meta.at(type.self); - if (m.members.size() == 0) break; - if (!ep_args.empty()) - ep_args += ", "; - ep_args += "constant " + type_to_glsl(type) + "& " + to_name(var.self); - ep_args += " [[buffer(" + convert_to_string(get_metal_resource_index(var, type.basetype)) + ")]]"; + case SPIRType::Struct: + { + auto &m = meta.at(type.self); + if (m.members.size() == 0) break; + if (!ep_args.empty()) + ep_args += ", "; + if ((meta[type.self].decoration.decoration_flags & (1ull << DecorationBufferBlock)) != 0 && + (meta[var.self].decoration.decoration_flags & (1ull << DecorationNonWritable)) == 0) + { + ep_args += "device "; } + else + { + ep_args += "constant "; + } + ep_args += type_to_glsl(type) + "& " + to_name(var.self); + ep_args += " [[buffer(" + convert_to_string(get_metal_resource_index(var, type.basetype)) + ")]]"; break; + } case SPIRType::Sampler: if (!ep_args.empty()) ep_args += ", "; @@ -1789,13 +1785,20 @@ size_t CompilerMSL::get_declared_type_size(uint32_t type_id, uint64_t dec_mask) } } -// If the opcode requires a bespoke custom function be output, remember it. 
-bool CompilerMSL::CustomFunctionHandler::handle(Op opcode, const uint32_t * /*args*/, uint32_t /*length*/)
+bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t * /*args*/, uint32_t /*length*/)
 {
 	switch (opcode)
 	{
+	// If an opcode requires a bespoke custom function be output, remember it.
 	case OpFMod:
-		custom_function_ops.insert(opcode);
+		compiler.custom_function_ops.insert(uint32_t(opcode));
+		break;
+
+	// Since MSL exists in a single execution scope, function prototype declarations are not
+	// needed, and clutter the output. If secondary functions are output (as indicated by the
+	// presence of OpFunctionCall), then suppress compiler warnings of missing function prototypes.
+	case OpFunctionCall:
+		suppress_missing_prototypes = true;
 		break;
 
 	default:
diff --git a/spirv_msl.hpp b/spirv_msl.hpp
index a2bf7ace..b900d684 100644
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -117,7 +117,7 @@ protected:
 	                          uint32_t grad_y, uint32_t lod, uint32_t coffset, uint32_t offset, uint32_t bias,
 	                          uint32_t comp, uint32_t sample, bool *p_forward) override;
 
-	void register_custom_functions();
+	void preprocess_op_codes();
 	void emit_custom_functions();
 	void localize_global_variables();
 	void extract_global_variables_from_functions();
@@ -131,8 +131,6 @@ protected:
 	void emit_resources();
 	void emit_interface_block(uint32_t ib_var_id);
 
-	void emit_function_prototype(SPIRFunction &func, bool is_decl);
-	void emit_function_declarations();
-
 	void populate_func_name_overrides();
 	void populate_var_name_overrides();
 
@@ -169,20 +167,18 @@ protected:
 	std::string stage_uniform_var_name = "uniforms";
 	std::string sampler_name_suffix = "Smplr";
 
-	// Extracts a set of opcodes that should be implemented as a bespoke custom function
-	// whose full source code is output as part of the shader source code.
-	struct CustomFunctionHandler : OpcodeHandler
+	// OpcodeHandler that handles several MSL preprocessing operations.
+ struct OpCodePreprocessor : OpcodeHandler { - CustomFunctionHandler(const CompilerMSL &compiler_, std::set &custom_function_ops_) + OpCodePreprocessor(CompilerMSL &compiler_) : compiler(compiler_) - , custom_function_ops(custom_function_ops_) { } bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override; - const CompilerMSL &compiler; - std::set &custom_function_ops; + CompilerMSL &compiler; + bool suppress_missing_prototypes = false; }; // Sorts the members of a SPIRType and associated Meta info based on a settable sorting diff --git a/test_shaders.py b/test_shaders.py index daac82a6..f3349827 100755 --- a/test_shaders.py +++ b/test_shaders.py @@ -2,6 +2,7 @@ import sys import os +import os.path import subprocess import tempfile import re @@ -10,6 +11,8 @@ import hashlib import shutil import argparse +METALC = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/usr/bin/metal' + def parse_stats(stats): m = re.search('([0-9]+) work registers', stats) registers = int(m.group(1)) if m else 0 @@ -60,13 +63,31 @@ def get_shader_stats(shader): returned = stdout.decode('utf-8') return parse_stats(returned) +def validate_shader_msl(shader): + subprocess.check_call([METALC, '-x', 'metal', '-std=ios-metal1.0', '-Werror', shader]) + +def cross_compile_msl(shader): + spirv_f, spirv_path = tempfile.mkstemp() + msl_f, msl_path = tempfile.mkstemp(suffix = os.path.basename(shader)) + os.close(spirv_f) + os.close(msl_f) + subprocess.check_call(['glslangValidator', '-V', '-o', spirv_path, shader]) + spirv_cross_path = './spirv-cross' + subprocess.check_call([spirv_cross_path, '--entry', 'main', '--output', msl_path, spirv_path, '--metal']) + subprocess.check_call(['spirv-val', spirv_path]) + + if os.path.exists(METALC): + validate_shader_msl(msl_path) + + return (spirv_path, msl_path) + def validate_shader(shader, vulkan): if vulkan: subprocess.check_call(['glslangValidator', '-V', shader]) else: subprocess.check_call(['glslangValidator', 
shader]) -def cross_compile(shader, vulkan, spirv, eliminate, invalid_spirv): +def cross_compile(shader, vulkan, spirv, invalid_spirv, eliminate, is_legacy, flatten_ubo): spirv_f, spirv_path = tempfile.mkstemp() glsl_f, glsl_path = tempfile.mkstemp(suffix = os.path.basename(shader)) os.close(spirv_f) @@ -84,21 +105,23 @@ def cross_compile(shader, vulkan, spirv, eliminate, invalid_spirv): if not invalid_spirv: subprocess.check_call(['spirv-val', spirv_path]) - spirv_cross_path = './spirv-cross' + extra_args = [] if eliminate: - subprocess.check_call([spirv_cross_path, '--remove-unused-variables', '--entry', 'main', '--output', glsl_path, spirv_path]) - else: - subprocess.check_call([spirv_cross_path, '--entry', 'main', '--output', glsl_path, spirv_path]) + extra_args += ['--remove-unused-variables'] + if is_legacy: + extra_args += ['--version', '100', '--es'] + if flatten_ubo: + extra_args += ['--flatten-ubo'] + + spirv_cross_path = './spirv-cross' + subprocess.check_call([spirv_cross_path, '--entry', 'main', '--output', glsl_path, spirv_path] + extra_args) # A shader might not be possible to make valid GLSL from, skip validation for this case. if (not ('nocompat' in glsl_path)) and (not spirv): validate_shader(glsl_path, False) if vulkan or spirv: - if eliminate: - subprocess.check_call([spirv_cross_path, '--remove-unused-variables', '--entry', 'main', '--vulkan-semantics', '--output', vulkan_glsl_path, spirv_path]) - else: - subprocess.check_call([spirv_cross_path, '--entry', 'main', '--vulkan-semantics', '--output', vulkan_glsl_path, spirv_path]) + subprocess.check_call([spirv_cross_path, '--entry', 'main', '--vulkan-semantics', '--output', vulkan_glsl_path, spirv_path] + extra_args) validate_shader(vulkan_glsl_path, vulkan) return (spirv_path, glsl_path, vulkan_glsl_path if vulkan else None) @@ -171,6 +194,12 @@ def shader_is_spirv(shader): def shader_is_invalid_spirv(shader): return '.invalid.' in shader +def shader_is_legacy(shader): + return '.legacy.' 
in shader + +def shader_is_flatten_ubo(shader): + return '.flatten.' in shader + def test_shader(stats, shader, update, keep): joined_path = os.path.join(shader[0], shader[1]) vulkan = shader_is_vulkan(shader[1]) @@ -178,9 +207,11 @@ def test_shader(stats, shader, update, keep): eliminate = shader_is_eliminate_dead_variables(shader[1]) is_spirv = shader_is_spirv(shader[1]) invalid_spirv = shader_is_invalid_spirv(shader[1]) + is_legacy = shader_is_legacy(shader[1]) + flatten_ubo = shader_is_flatten_ubo(shader[1]) print('Testing shader:', joined_path) - spirv, glsl, vulkan_glsl = cross_compile(joined_path, vulkan, is_spirv, eliminate, invalid_spirv) + spirv, glsl, vulkan_glsl = cross_compile(joined_path, vulkan, is_spirv, invalid_spirv, eliminate, is_legacy, flatten_ubo) # Only test GLSL stats if we have a shader following GL semantics. if stats and (not vulkan) and (not is_spirv) and (not desktop): @@ -202,20 +233,30 @@ def test_shader(stats, shader, update, keep): a.append(str(i)) print(','.join(a), file = stats) -def test_shaders_helper(stats, shader_dir, update, malisc, keep): +def test_shader_msl(stats, shader, update, keep): + joined_path = os.path.join(shader[0], shader[1]) + print('Testing MSL shader:', joined_path) + spirv, msl = cross_compile_msl(joined_path) + regression_check(shader, msl, update, keep) + os.remove(spirv) + +def test_shaders_helper(stats, shader_dir, update, malisc, keep, backend): for root, dirs, files in os.walk(os.path.join(shader_dir)): for i in files: path = os.path.join(root, i) relpath = os.path.relpath(path, shader_dir) - test_shader(stats, (shader_dir, relpath), update, keep) + if backend == 'metal': + test_shader_msl(stats, (shader_dir, relpath), update, keep) + else: + test_shader(stats, (shader_dir, relpath), update, keep) -def test_shaders(shader_dir, update, malisc, keep): +def test_shaders(shader_dir, update, malisc, keep, backend): if malisc: with open('stats.csv', 'w') as stats: 
print('Shader,OrigRegs,OrigUniRegs,OrigALUShort,OrigLSShort,OrigTEXShort,OrigALULong,OrigLSLong,OrigTEXLong,CrossRegs,CrossUniRegs,CrossALUShort,CrossLSShort,CrossTEXShort,CrossALULong,CrossLSLong,CrossTEXLong', file = stats) - test_shaders_helper(stats, shader_dir, update, malisc, keep) + test_shaders_helper(stats, shader_dir, update, malisc, keep, backend) else: - test_shaders_helper(None, shader_dir, update, malisc, keep) + test_shaders_helper(None, shader_dir, update, malisc, keep, backend) def main(): parser = argparse.ArgumentParser(description = 'Script for regression testing.') @@ -230,13 +271,19 @@ def main(): parser.add_argument('--malisc', action = 'store_true', help = 'Use malisc offline compiler to determine static cycle counts before and after spirv-cross.') + parser.add_argument('--metal', + action = 'store_true', + help = 'Test Metal backend.') args = parser.parse_args() if not args.folder: sys.stderr.write('Need shader folder.\n') sys.exit(1) - test_shaders(args.folder, args.update, args.malisc, args.keep) + if os.path.exists(METALC): + subprocess.check_call([METALC, '--version']) + + test_shaders(args.folder, args.update, args.malisc, args.keep, 'metal' if args.metal else 'glsl') if args.malisc: print('Stats in stats.csv!') print('Tests completed!')