Merge pull request #2233 from cdavis5e/agx-cube-grad-fixup

MSL: Work around broken cube texture gradients on Apple Silicon.
2023-11-28 12:05:55 +01:00 · 2023-11-28 12:05:55 +01:00 · 50e90dd74e
commit 50e90dd74e
parent 3717660e14 18976c4307
13 changed files with 382 additions and 15 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -243,7 +243,7 @@ set(spirv-cross-util-sources
 		${CMAKE_CURRENT_SOURCE_DIR}/spirv_cross_util.hpp)

 set(spirv-cross-abi-major 0)
-set(spirv-cross-abi-minor 57)
+set(spirv-cross-abi-minor 58)
 set(spirv-cross-abi-patch 0)
 set(SPIRV_CROSS_VERSION ${spirv-cross-abi-major}.${spirv-cross-abi-minor}.${spirv-cross-abi-patch})

--- a/main.cpp
+++ b/main.cpp
@ -678,6 +678,8 @@ struct CLIArguments
 	bool msl_sample_dref_lod_array_as_grad = false;
 	bool msl_runtime_array_rich_descriptor = false;
 	bool msl_replace_recursive_inputs = false;
+	bool msl_readwrite_texture_fences = true;
+	bool msl_agx_manual_cube_grad_fixup = false;
 	const char *msl_combined_sampler_suffix = nullptr;
 	bool glsl_emit_push_constant_as_ubo = false;
 	bool glsl_emit_ubo_as_plain_uniforms = false;
@ -958,6 +960,14 @@ static void print_help_msl()
 	                "\t\tSome Metal devices have a bug where the level() argument to\n"
 	                "\t\tdepth2d_array<T>::sample_compare() in a fragment shader is biased by some\n"
 	                "\t\tunknown amount. This prevents the bias from being added.\n"
+	                "\t[--msl-no-readwrite-texture-fences]:\n\t\tDo not insert fences before each read of a\n"
+	                "\t\tread_write texture. MSL does not guarantee coherence between writes and later reads\n"
+	                "\t\tof read_write textures. If you don't rely on this, you can disable this for a\n"
+	                "\t\tpossible performance improvement.\n"
+	                "\t[--msl-agx-manual-cube-grad-fixup]:\n\t\tManually transform cube texture gradients.\n"
+	                "\t\tAll released Apple Silicon GPUs to date ignore one of the three partial derivatives\n"
+	                "\t\tbased on the selected major axis, and expect the remaining derivatives to be\n"
+	                "\t\tpartially transformed. This fixup gives correct results on Apple Silicon.\n"
 	                "\t[--msl-combined-sampler-suffix <suffix>]:\n\t\tUses a custom suffix for combined samplers.\n");
 	// clang-format on
 }
@ -1236,6 +1246,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		msl_opts.ios_support_base_vertex_instance = true;
 		msl_opts.runtime_array_rich_descriptor = args.msl_runtime_array_rich_descriptor;
 		msl_opts.replace_recursive_inputs = args.msl_replace_recursive_inputs;
+		msl_opts.readwrite_texture_fences = args.msl_readwrite_texture_fences;
+		msl_opts.agx_manual_cube_grad_fixup = args.msl_agx_manual_cube_grad_fixup;
 		msl_comp->set_msl_options(msl_opts);
 		for (auto &v : args.msl_discrete_descriptor_sets)
 			msl_comp->add_discrete_descriptor_set(v);
@ -1790,6 +1802,8 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--msl-check-discarded-frag-stores", [&args](CLIParser &) { args.msl_check_discarded_frag_stores = true; });
 	cbs.add("--msl-sample-dref-lod-array-as-grad",
 	        [&args](CLIParser &) { args.msl_sample_dref_lod_array_as_grad = true; });
+	cbs.add("--msl-no-readwrite-texture-fences", [&args](CLIParser &) { args.msl_readwrite_texture_fences = false; });
+	cbs.add("--msl-agx-manual-cube-grad-fixup", [&args](CLIParser &) { args.msl_agx_manual_cube_grad_fixup = true; });
 	cbs.add("--msl-combined-sampler-suffix", [&args](CLIParser &parser) {
 		args.msl_combined_sampler_suffix = parser.next_string();
 	});
--- a/reference/opt/shaders-msl/asm/frag/depth-array-texture-lod.lod-as-grad.1d-as-2d.agx-cube-grad.msl23.asm.frag
+++ b/reference/opt/shaders-msl/asm/frag/depth-array-texture-lod.lod-as-grad.1d-as-2d.agx-cube-grad.msl23.asm.frag
@ -0,0 +1,45 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+static inline gradientcube spvGradientCube(float3 P, float3 dPdx, float3 dPdy)
+{
+    // Major axis selection
+    float3 absP = abs(P);
+    bool xMajor = absP.x >= max(absP.y, absP.z);
+    bool yMajor = absP.y >= absP.z;
+    float3 Q = xMajor ? P.yzx : (yMajor ? P.xzy : P);
+    float3 dQdx = xMajor ? dPdx.yzx : (yMajor ? dPdx.xzy : dPdx);
+    float3 dQdy = xMajor ? dPdy.yzx : (yMajor ? dPdy.xzy : dPdy);
+
+    // Skip a couple of operations compared to usual projection
+    float4 d = float4(dQdx.xy, dQdy.xy) - (Q.xy / Q.z).xyxy * float4(dQdx.zz, dQdy.zz);
+
+    // Final swizzle to put the intermediate values into non-ignored components
+    // X major: X and Z
+    // Y major: X and Y
+    // Z major: Y and Z
+    return gradientcube(xMajor ? d.xxy : d.xyx, xMajor ? d.zzw : d.zwz);
+}
+
+struct main0_out
+{
+    float4 o_color [[color(0)]];
+};
+
+struct main0_in
+{
+    float4 v_texCoord [[user(locn0)]];
+    float2 v_drefLodBias [[user(locn1)]];
+};
+
+fragment main0_out main0(main0_in in [[stage_in]], depthcube_array<float> u_sampler [[texture(0)]], sampler u_samplerSmplr [[sampler(0)]])
+{
+    main0_out out = {};
+    out.o_color = float4(u_sampler.sample_compare(u_samplerSmplr, in.v_texCoord.xyz, uint(rint(in.v_texCoord.w)), in.v_drefLodBias.x, spvGradientCube(in.v_texCoord.xyz, exp2(in.v_drefLodBias.y - 0.5) / float3(u_sampler.get_width()), exp2(in.v_drefLodBias.y - 0.5) / float3(u_sampler.get_width()))), 0.0, 0.0, 1.0);
+    return out;
+}
+
--- a/reference/opt/shaders-msl/frag/sampler-cube-grad.agx-cube-grad.frag
+++ b/reference/opt/shaders-msl/frag/sampler-cube-grad.agx-cube-grad.frag
@ -0,0 +1,44 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+static inline gradientcube spvGradientCube(float3 P, float3 dPdx, float3 dPdy)
+{
+    // Major axis selection
+    float3 absP = abs(P);
+    bool xMajor = absP.x >= max(absP.y, absP.z);
+    bool yMajor = absP.y >= absP.z;
+    float3 Q = xMajor ? P.yzx : (yMajor ? P.xzy : P);
+    float3 dQdx = xMajor ? dPdx.yzx : (yMajor ? dPdx.xzy : dPdx);
+    float3 dQdy = xMajor ? dPdy.yzx : (yMajor ? dPdy.xzy : dPdy);
+
+    // Skip a couple of operations compared to usual projection
+    float4 d = float4(dQdx.xy, dQdy.xy) - (Q.xy / Q.z).xyxy * float4(dQdx.zz, dQdy.zz);
+
+    // Final swizzle to put the intermediate values into non-ignored components
+    // X major: X and Z
+    // Y major: X and Y
+    // Z major: Y and Z
+    return gradientcube(xMajor ? d.xxy : d.xyx, xMajor ? d.zzw : d.zwz);
+}
+
+struct main0_out
+{
+    float4 FragColor [[color(0)]];
+};
+
+struct main0_in
+{
+    float3 vTex [[user(locn0), flat]];
+};
+
+fragment main0_out main0(main0_in in [[stage_in]], texturecube<float> uSampler [[texture(0)]], sampler uSamplerSmplr [[sampler(0)]])
+{
+    main0_out out = {};
+    out.FragColor += uSampler.sample(uSamplerSmplr, in.vTex, spvGradientCube(in.vTex, float3(5.0), float3(8.0)));
+    return out;
+}
+
--- a/reference/shaders-msl/asm/frag/depth-array-texture-lod.lod-as-grad.1d-as-2d.agx-cube-grad.msl23.asm.frag
+++ b/reference/shaders-msl/asm/frag/depth-array-texture-lod.lod-as-grad.1d-as-2d.agx-cube-grad.msl23.asm.frag
@ -0,0 +1,55 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+static inline gradientcube spvGradientCube(float3 P, float3 dPdx, float3 dPdy)
+{
+    // Major axis selection
+    float3 absP = abs(P);
+    bool xMajor = absP.x >= max(absP.y, absP.z);
+    bool yMajor = absP.y >= absP.z;
+    float3 Q = xMajor ? P.yzx : (yMajor ? P.xzy : P);
+    float3 dQdx = xMajor ? dPdx.yzx : (yMajor ? dPdx.xzy : dPdx);
+    float3 dQdy = xMajor ? dPdy.yzx : (yMajor ? dPdy.xzy : dPdy);
+
+    // Skip a couple of operations compared to usual projection
+    float4 d = float4(dQdx.xy, dQdy.xy) - (Q.xy / Q.z).xyxy * float4(dQdx.zz, dQdy.zz);
+
+    // Final swizzle to put the intermediate values into non-ignored components
+    // X major: X and Z
+    // Y major: X and Y
+    // Z major: Y and Z
+    return gradientcube(xMajor ? d.xxy : d.xyx, xMajor ? d.zzw : d.zwz);
+}
+
+struct buf0
+{
+    float4 u_scale;
+};
+
+struct buf1
+{
+    float4 u_bias;
+};
+
+struct main0_out
+{
+    float4 o_color [[color(0)]];
+};
+
+struct main0_in
+{
+    float4 v_texCoord [[user(locn0)]];
+    float2 v_drefLodBias [[user(locn1)]];
+};
+
+fragment main0_out main0(main0_in in [[stage_in]], depthcube_array<float> u_sampler [[texture(0)]], sampler u_samplerSmplr [[sampler(0)]])
+{
+    main0_out out = {};
+    out.o_color = float4(u_sampler.sample_compare(u_samplerSmplr, in.v_texCoord.xyz, uint(rint(in.v_texCoord.w)), in.v_drefLodBias.x, spvGradientCube(in.v_texCoord.xyz, exp2(in.v_drefLodBias.y - 0.5) / float3(u_sampler.get_width()), exp2(in.v_drefLodBias.y - 0.5) / float3(u_sampler.get_width()))), 0.0, 0.0, 1.0);
+    return out;
+}
+
--- a/reference/shaders-msl/frag/sampler-cube-grad.agx-cube-grad.frag
+++ b/reference/shaders-msl/frag/sampler-cube-grad.agx-cube-grad.frag
@ -0,0 +1,44 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+static inline gradientcube spvGradientCube(float3 P, float3 dPdx, float3 dPdy)
+{
+    // Major axis selection
+    float3 absP = abs(P);
+    bool xMajor = absP.x >= max(absP.y, absP.z);
+    bool yMajor = absP.y >= absP.z;
+    float3 Q = xMajor ? P.yzx : (yMajor ? P.xzy : P);
+    float3 dQdx = xMajor ? dPdx.yzx : (yMajor ? dPdx.xzy : dPdx);
+    float3 dQdy = xMajor ? dPdy.yzx : (yMajor ? dPdy.xzy : dPdy);
+
+    // Skip a couple of operations compared to usual projection
+    float4 d = float4(dQdx.xy, dQdy.xy) - (Q.xy / Q.z).xyxy * float4(dQdx.zz, dQdy.zz);
+
+    // Final swizzle to put the intermediate values into non-ignored components
+    // X major: X and Z
+    // Y major: X and Y
+    // Z major: Y and Z
+    return gradientcube(xMajor ? d.xxy : d.xyx, xMajor ? d.zzw : d.zwz);
+}
+
+struct main0_out
+{
+    float4 FragColor [[color(0)]];
+};
+
+struct main0_in
+{
+    float3 vTex [[user(locn0), flat]];
+};
+
+fragment main0_out main0(main0_in in [[stage_in]], texturecube<float> uSampler [[texture(0)]], sampler uSamplerSmplr [[sampler(0)]])
+{
+    main0_out out = {};
+    out.FragColor += uSampler.sample(uSamplerSmplr, in.vTex, spvGradientCube(in.vTex, float3(5.0), float3(8.0)));
+    return out;
+}
+
--- a/shaders-msl/asm/frag/depth-array-texture-lod.lod-as-grad.1d-as-2d.agx-cube-grad.msl23.asm.frag
+++ b/shaders-msl/asm/frag/depth-array-texture-lod.lod-as-grad.1d-as-2d.agx-cube-grad.msl23.asm.frag
@ -0,0 +1,86 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 45
+; Schema: 0
+               OpCapability Shader
+               OpCapability SampledCubeArray
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Fragment %main "main" %o_color %v_texCoord %v_drefLodBias
+               OpExecutionMode %main OriginUpperLeft
+
+               ; Debug Information
+               OpSource GLSL 450
+               OpName %main "main"  ; id %4
+               OpName %o_color "o_color"  ; id %9
+               OpName %u_sampler "u_sampler"  ; id %13
+               OpName %v_texCoord "v_texCoord"  ; id %16
+               OpName %v_drefLodBias "v_drefLodBias"  ; id %21
+               OpName %buf0 "buf0"  ; id %39
+               OpMemberName %buf0 0 "u_scale"
+               OpName %_ ""  ; id %41
+               OpName %buf1 "buf1"  ; id %42
+               OpMemberName %buf1 0 "u_bias"
+               OpName %__0 ""  ; id %44
+
+               ; Annotations
+               OpDecorate %o_color RelaxedPrecision
+               OpDecorate %o_color Location 0
+               OpDecorate %u_sampler DescriptorSet 0
+               OpDecorate %u_sampler Binding 0
+               OpDecorate %v_texCoord Location 0
+               OpDecorate %v_drefLodBias Location 1
+               OpMemberDecorate %buf0 0 Offset 0
+               OpDecorate %buf0 Block
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 1
+               OpMemberDecorate %buf1 0 Offset 0
+               OpDecorate %buf1 Block
+               OpDecorate %__0 DescriptorSet 0
+               OpDecorate %__0 Binding 2
+
+               ; Types, variables and constants
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+%_ptr_Output_v4float = OpTypePointer Output %v4float
+    %o_color = OpVariable %_ptr_Output_v4float Output
+         %10 = OpTypeImage %float Cube 1 1 0 1 Unknown
+         %11 = OpTypeSampledImage %10
+%_ptr_UniformConstant_11 = OpTypePointer UniformConstant %11
+  %u_sampler = OpVariable %_ptr_UniformConstant_11 UniformConstant
+%_ptr_Input_v4float = OpTypePointer Input %v4float
+ %v_texCoord = OpVariable %_ptr_Input_v4float Input
+    %v2float = OpTypeVector %float 2
+%_ptr_Input_v2float = OpTypePointer Input %v2float
+%v_drefLodBias = OpVariable %_ptr_Input_v2float Input
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_float = OpTypePointer Input %float
+    %v3float = OpTypeVector %float 3
+     %uint_1 = OpConstant %uint 1
+    %float_0 = OpConstant %float 0
+    %float_1 = OpConstant %float 1
+       %buf0 = OpTypeStruct %v4float
+%_ptr_Uniform_buf0 = OpTypePointer Uniform %buf0
+          %_ = OpVariable %_ptr_Uniform_buf0 Uniform
+       %buf1 = OpTypeStruct %v4float
+%_ptr_Uniform_buf1 = OpTypePointer Uniform %buf1
+        %__0 = OpVariable %_ptr_Uniform_buf1 Uniform
+
+               ; Function main
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %14 = OpLoad %11 %u_sampler
+         %18 = OpLoad %v4float %v_texCoord
+         %25 = OpAccessChain %_ptr_Input_float %v_drefLodBias %uint_0
+         %26 = OpLoad %float %25
+         %32 = OpAccessChain %_ptr_Input_float %v_drefLodBias %uint_1
+         %33 = OpLoad %float %32
+         %35 = OpImageSampleDrefExplicitLod %float %14 %18 %26 Lod %33
+         %38 = OpCompositeConstruct %v4float %35 %float_0 %float_0 %float_1
+               OpStore %o_color %38
+               OpReturn
+               OpFunctionEnd
--- a/shaders-msl/frag/sampler-cube-grad.agx-cube-grad.frag
+++ b/shaders-msl/frag/sampler-cube-grad.agx-cube-grad.frag
@ -0,0 +1,10 @@
+#version 450
+
+layout(location = 0) out vec4 FragColor;
+layout(location = 0) flat in vec3 vTex;
+layout(binding = 0) uniform samplerCube uSampler;
+
+void main()
+{
+	FragColor += textureGrad(uSampler, vTex, vec3(5.0), vec3(8.0));
+}
--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@ -742,6 +742,18 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_MSL_SAMPLE_DREF_LOD_ARRAY_AS_GRAD:
 		options->msl.sample_dref_lod_array_as_grad = value != 0;
 		break;
+
+	case SPVC_COMPILER_OPTION_MSL_READWRITE_TEXTURE_FENCES: // Boolean option carried in a uint.
+		options->msl.readwrite_texture_fences = value != 0; // Any non-zero value enables it; 0 disables it.
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_REPLACE_RECURSIVE_INPUTS: // Boolean option carried in a uint.
+		options->msl.replace_recursive_inputs = value != 0; // Any non-zero value enables it; 0 disables it.
+		break;
+
+	case SPVC_COMPILER_OPTION_MSL_AGX_MANUAL_CUBE_GRAD_FIXUP: // Boolean option carried in a uint.
+		options->msl.agx_manual_cube_grad_fixup = value != 0; // Any non-zero value enables it; 0 disables it.
+		break;
 #endif

 	default:
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@ -40,7 +40,7 @@ extern "C" {
 /* Bumped if ABI or API breaks backwards compatibility. */
 #define SPVC_C_API_VERSION_MAJOR 0
 /* Bumped if APIs or enumerations are added in a backwards compatible way. */
-#define SPVC_C_API_VERSION_MINOR 57
+#define SPVC_C_API_VERSION_MINOR 58
 /* Bumped if internal implementation details change. */
 #define SPVC_C_API_VERSION_PATCH 0

@ -725,6 +725,9 @@ typedef enum spvc_compiler_option

 	SPVC_COMPILER_OPTION_MSL_ARGUMENT_BUFFERS_TIER = 84 | SPVC_COMPILER_OPTION_MSL_BIT,
 	SPVC_COMPILER_OPTION_MSL_SAMPLE_DREF_LOD_ARRAY_AS_GRAD = 85 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_READWRITE_TEXTURE_FENCES = 86 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_REPLACE_RECURSIVE_INPUTS = 87 | SPVC_COMPILER_OPTION_MSL_BIT,
+	SPVC_COMPILER_OPTION_MSL_AGX_MANUAL_CUBE_GRAD_FIXUP = 88 | SPVC_COMPILER_OPTION_MSL_BIT,

 	SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 } spvc_compiler_option;
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@ -5725,6 +5725,31 @@ void CompilerMSL::emit_custom_functions()
 			break;
 		}

+		// Fix up gradient vectors when sampling a cube texture for Apple Silicon. Requested by to_function_args() when msl_options.agx_manual_cube_grad_fixup is set.
+		// h/t Alexey Knyazev (https://github.com/KhronosGroup/MoltenVK/issues/2068#issuecomment-1817799067) for the code.
+		case SPVFuncImplGradientCube:
+			statement("static inline gradientcube spvGradientCube(float3 P, float3 dPdx, float3 dPdy)"); // Call sites pass the texture coordinate as P, followed by the two gradient expressions.
+			begin_scope();
+			statement("// Major axis selection");
+			statement("float3 absP = abs(P);");
+			statement("bool xMajor = absP.x >= max(absP.y, absP.z);");
+			statement("bool yMajor = absP.y >= absP.z;"); // Only consulted in the !xMajor branches below, so comparing against z alone is enough.
+			statement("float3 Q = xMajor ? P.yzx : (yMajor ? P.xzy : P);"); // Swizzle so the major-axis component ends up in Q.z.
+			statement("float3 dQdx = xMajor ? dPdx.yzx : (yMajor ? dPdx.xzy : dPdx);"); // Same swizzle applied to both gradients.
+			statement("float3 dQdy = xMajor ? dPdy.yzx : (yMajor ? dPdy.xzy : dPdy);");
+			statement_no_indent(""); // Blank line in the emitted MSL.
+			statement("// Skip a couple of operations compared to usual projection");
+			statement("float4 d = float4(dQdx.xy, dQdy.xy) - (Q.xy / Q.z).xyxy * float4(dQdx.zz, dQdy.zz);"); // d.xy derives from dPdx, d.zw from dPdy.
+			statement_no_indent(""); // Blank line in the emitted MSL.
+			statement("// Final swizzle to put the intermediate values into non-ignored components");
+			statement("// X major: X and Z");
+			statement("// Y major: X and Y");
+			statement("// Z major: Y and Z");
+			statement("return gradientcube(xMajor ? d.xxy : d.xyx, xMajor ? d.zzw : d.zwz);");
+			end_scope();
+			statement(""); // Blank line separating the helper from whatever is emitted next.
+			break;
+
 		// "fadd" intrinsic support
 		case SPVFuncImplFAdd:
 			statement("template<typename T>");
@ -11123,29 +11148,38 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool
 			// rhoX = dP/dx * extent; rhoY = dP/dy * extent
 			// Therefore, dP/dx = dP/dy = exp2(lod)/extent.
 			// (Subtracting 0.5 before exponentiation gives better results.)
-			string grad_opt, extent;
+			string grad_opt, extent, grad_coord;
 			VariableID base_img = img;
 			if (auto *combined = maybe_get<SPIRCombinedImageSampler>(img))
 				base_img = combined->image;
 			switch (imgtype.image.dim)
 			{
 			case Dim1D:
-				grad_opt = "2d";
+				grad_opt = "gradient2d";
 				extent = join("float2(", to_expression(base_img), ".get_width(), 1.0)");
 				break;
 			case Dim2D:
-				grad_opt = "2d";
+				grad_opt = "gradient2d";
 				extent = join("float2(", to_expression(base_img), ".get_width(), ", to_expression(base_img), ".get_height())");
 				break;
 			case DimCube:
 				if (imgtype.image.arrayed && msl_options.emulate_cube_array)
 				{
-					grad_opt = "2d";
+					grad_opt = "gradient2d";
 					extent = join("float2(", to_expression(base_img), ".get_width())");
 				}
 				else
 				{
-					grad_opt = "cube";
+					if (msl_options.agx_manual_cube_grad_fixup)
+					{
+						add_spv_func_and_recompile(SPVFuncImplGradientCube);
+						grad_opt = "spvGradientCube";
+						grad_coord = tex_coords + ", ";
+					}
+					else
+					{
+						grad_opt = "gradientcube";
+					}
 					extent = join("float3(", to_expression(base_img), ".get_width())");
 				}
 				break;
@ -11154,8 +11188,8 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool
 				extent = "float3(1.0)";
 				break;
 			}
-			farg_str += join(", gradient", grad_opt, "(exp2(", to_expression(lod), " - 0.5) / ", extent, ", exp2(",
-			                 to_expression(lod), " - 0.5) / ", extent, ")");
+			farg_str += join(", ", grad_opt, "(", grad_coord, "exp2(", to_expression(lod), " - 0.5) / ", extent,
+			                 ", exp2(", to_expression(lod), " - 0.5) / ", extent, ")");
 		}
 		else
 		{
@ -11175,27 +11209,37 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool
 	{
 		forward = forward && should_forward(grad_x);
 		forward = forward && should_forward(grad_y);
-		string grad_opt;
+		string grad_opt, grad_coord;
 		switch (imgtype.image.dim)
 		{
 		case Dim1D:
 		case Dim2D:
-			grad_opt = "2d";
+			grad_opt = "gradient2d";
 			break;
 		case Dim3D:
-			grad_opt = "3d";
+			grad_opt = "gradient3d";
 			break;
 		case DimCube:
 			if (imgtype.image.arrayed && msl_options.emulate_cube_array)
-				grad_opt = "2d";
+			{
+				grad_opt = "gradient2d";
+			}
+			else if (msl_options.agx_manual_cube_grad_fixup)
+			{
+				add_spv_func_and_recompile(SPVFuncImplGradientCube);
+				grad_opt = "spvGradientCube";
+				grad_coord = tex_coords + ", ";
+			}
 			else
-				grad_opt = "cube";
+			{
+				grad_opt = "gradientcube";
+			}
 			break;
 		default:
 			grad_opt = "unsupported_gradient_dimension";
 			break;
 		}
-		farg_str += ", gradient" + grad_opt + "(" + to_expression(grad_x) + ", " + to_expression(grad_y) + ")";
+		farg_str += join(", ", grad_opt, "(", grad_coord, to_expression(grad_x), ", ", to_expression(grad_y), ")");
 	}

 	if (args.min_lod)
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@ -512,6 +512,13 @@ public:
 		// The bug has been reported to Apple, and will hopefully be fixed in future releases.
 		bool replace_recursive_inputs = false;

+		// If set, manual fixups of gradient vectors for cube texture lookups will be performed.
+		// All Apple Silicon GPUs released as of this change (November 2023) behave incorrectly
+		// when sampling a cube texture with explicit gradients: they ignore one of the three
+		// partial derivatives based on the selected major axis, and expect the remaining
+		// derivatives to be partially transformed.
+		bool agx_manual_cube_grad_fixup = false;
+
 		bool is_ios() const
 		{
 			return platform == iOS;
@ -756,6 +763,7 @@ protected:
 		SPVFuncImplArrayOfArrayCopy6Dim = SPVFuncImplArrayCopyMultidimBase + 6,
 		SPVFuncImplTexelBufferCoords,
 		SPVFuncImplImage2DAtomicCoords, // Emulate texture2D atomic operations
+		SPVFuncImplGradientCube,
 		SPVFuncImplFMul,
 		SPVFuncImplFAdd,
 		SPVFuncImplFSub,
--- a/test_shaders.py
+++ b/test_shaders.py
@ -355,6 +355,8 @@ def cross_compile_msl(shader, spirv, opt, iterations, paths):
        msl_args.append('--msl-check-discarded-frag-stores')
    if '.lod-as-grad.' in shader:
        msl_args.append('--msl-sample-dref-lod-array-as-grad')
+    if '.agx-cube-grad.' in shader:
+        msl_args.append('--msl-agx-manual-cube-grad-fixup')
    if '.decoration-binding.' in shader:
        msl_args.append('--msl-decoration-binding')
    if '.rich-descriptor.' in shader: