Merge pull request #2153 from etang-cw/TexFence

[WIP] MSL: Prevent RAW hazards on read_write textures
2023-08-17 11:05:55 +02:00 · 2023-08-17 11:05:55 +02:00 · 637c211c6f
commit 637c211c6f
parent bccaa94db8 894113f55b
6 changed files with 15 additions and 1 deletions
--- a/reference/opt/shaders-msl/asm/comp/image-load-store-short-vector.invalid.asm.comp
+++ b/reference/opt/shaders-msl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -8,6 +8,7 @@ using namespace metal;
 static inline __attribute__((always_inline))
 void _main(thread const uint3& id, texture2d<float, access::read_write> TargetTexture)
 {
+    TargetTexture.fence();
    float2 loaded = TargetTexture.read(uint2(id.xy)).xy;
    float2 storeTemp = loaded + float2(1.0);
    TargetTexture.write(storeTemp.xyyy, uint2((id.xy + uint2(1u))));
--- a/reference/opt/shaders-msl/desktop-only/frag/image-ms.desktop.frag
+++ b/reference/opt/shaders-msl/desktop-only/frag/image-ms.desktop.frag
@@ -5,6 +5,7 @@ using namespace metal;

 fragment void main0(texture2d_ms<float> uImageMS [[texture(0)]], texture2d_array<float, access::read_write> uImageArray [[texture(1)]], texture2d<float, access::write> uImage [[texture(2)]])
 {
+    uImageArray.fence();
    uImage.write(uImageMS.read(uint2(int2(1, 2)), 2), uint2(int2(2, 3)));
    uImageArray.write(uImageArray.read(uint2(int3(1, 2, 4).xy), uint(int3(1, 2, 4).z)), uint2(int3(2, 3, 7).xy), uint(int3(2, 3, 7).z));
 }
--- a/reference/shaders-msl/asm/comp/image-load-store-short-vector.invalid.asm.comp
+++ b/reference/shaders-msl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -8,6 +8,7 @@ using namespace metal;
 static inline __attribute__((always_inline))
 void _main(thread const uint3& id, texture2d<float, access::read_write> TargetTexture)
 {
+    TargetTexture.fence();
    float2 loaded = TargetTexture.read(uint2(id.xy)).xy;
    float2 storeTemp = loaded + float2(1.0);
    TargetTexture.write(storeTemp.xyyy, uint2((id.xy + uint2(1u))));
--- a/reference/shaders-msl/desktop-only/frag/image-ms.desktop.frag
+++ b/reference/shaders-msl/desktop-only/frag/image-ms.desktop.frag
@@ -6,6 +6,7 @@ using namespace metal;
 fragment void main0(texture2d_ms<float> uImageMS [[texture(0)]], texture2d_array<float, access::read_write> uImageArray [[texture(1)]], texture2d<float, access::write> uImage [[texture(2)]])
 {
    float4 a = uImageMS.read(uint2(int2(1, 2)), 2);
+    uImageArray.fence();
    float4 b = uImageArray.read(uint2(int3(1, 2, 4).xy), uint(int3(1, 2, 4).z));
    uImage.write(a, uint2(int2(2, 3)));
    uImageArray.write(b, uint2(int3(2, 3, 7).xy), uint(int3(2, 3, 7).z));
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -8675,9 +8675,9 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 		// Mark that this shader reads from this image
 		uint32_t img_id = ops[2];
 		auto &type = expression_type(img_id);
+		auto *p_var = maybe_get_backing_variable(img_id);
 		if (type.image.dim != DimSubpassData)
 		{
-			auto *p_var = maybe_get_backing_variable(img_id);
 			if (p_var && has_decoration(p_var->self, DecorationNonReadable))
 			{
 				unset_decoration(p_var->self, DecorationNonReadable);
@@ -8685,6 +8685,10 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 			}
 		}

+		// Metal requires explicit fences to break up RAW hazards, even within the same shader invocation
+		if (msl_options.readwrite_texture_fences && p_var && !has_decoration(p_var->self, DecorationNonWritable))
+			statement(to_expression(img_id), ".fence();");
+
 		emit_texture_op(instruction, false);
 		break;
 	}
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -496,6 +496,12 @@ public:
 		// so it can be enabled only when the bug is present.
 		bool sample_dref_lod_array_as_grad = false;

+		// MSL doesn't guarantee coherence between writes and subsequent reads of read_write textures.
+		// This inserts fences before each read of a read_write texture to ensure coherency.
+		// If you're sure you never rely on this, you can set this to false for a possible performance improvement.
+		// Note: Only Apple's GPU compiler takes advantage of the lack of coherency, so make sure to test on Apple GPUs if you disable this.
+		bool readwrite_texture_fences = true;
+
 		bool is_ios() const
 		{
 			return platform == iOS;