Honor NoContraction qualifier.
For GLSL and HLSL we force a temporary and mark it as precise. MSL is a little weird here, but we can piggyback on the invariant float math option to force fma() operations everywhere.
parent 0eeaffe048
commit e47a30e807
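For orientation before the diff: a minimal, hypothetical plain-C++ sketch (not part of the commit; std::fma stands in for Metal's fma()) of the helper identities the MSL backend emits as spvFMul/spvFAdd/spvFSub. Routing each operation through fma() leaves no separate multiply/add pair for the downstream compiler to contract, which is the property NoContraction asks for.

// Plain C++ sketch (assumption: std::fma behaves like Metal's fma for this purpose).
// Mirrors the spvFMul/spvFAdd/spvFSub helpers in the MSL reference output below.
#include <cmath>
#include <cstdio>

template <typename T> T spvFMul(T l, T r) { return std::fma(l, r, T(0)); }  // l * r, rounded once
template <typename T> T spvFAdd(T l, T r) { return std::fma(T(1), l, r); }  // l + r, rounded once
template <typename T> T spvFSub(T l, T r) { return std::fma(T(-1), r, l); } // l - r, rounded once

int main()
{
    float a = 1.5f, b = 2.25f, c = -0.5f;
    // "precise mad = a * b + c" expressed as two separately rounded steps,
    // so no mul/add pair remains for the compiler to fuse behind our back.
    float mad = spvFAdd(spvFMul(a, b), c);
    std::printf("%f\n", mad); // prints 2.875000
    return 0;
}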
reference/opt/shaders-hlsl/vert/no-contraction.vert (new file)
@@ -0,0 +1,39 @@
static float4 gl_Position;
static float4 vA;
static float4 vB;
static float4 vC;

struct SPIRV_Cross_Input
{
    float4 vA : TEXCOORD0;
    float4 vB : TEXCOORD1;
    float4 vC : TEXCOORD2;
};

struct SPIRV_Cross_Output
{
    float4 gl_Position : SV_Position;
};

void vert_main()
{
    precise float4 _15 = vA * vB;
    precise float4 _19 = vA + vB;
    precise float4 _23 = vA - vB;
    precise float4 _30 = _15 + vC;
    precise float4 _34 = _15 + _19;
    precise float4 _36 = _34 + _23;
    precise float4 _38 = _36 + _30;
    gl_Position = _38;
}

SPIRV_Cross_Output main(SPIRV_Cross_Input stage_input)
{
    vA = stage_input.vA;
    vB = stage_input.vB;
    vC = stage_input.vC;
    vert_main();
    SPIRV_Cross_Output stage_output;
    stage_output.gl_Position = gl_Position;
    return stage_output;
}

reference/opt/shaders-msl/vert/no-contraction.vert (new file)
@@ -0,0 +1,88 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 gl_Position [[position]];
};

struct main0_in
{
    float4 vA [[attribute(0)]];
    float4 vB [[attribute(1)]];
    float4 vC [[attribute(2)]];
};

template<typename T>
T spvFMul(T l, T r)
{
    return fma(l, r, T(0));
}

template<typename T, int Cols, int Rows>
vec<T, Cols> spvFMulVectorMatrix(vec<T, Rows> v, matrix<T, Cols, Rows> m)
{
    vec<T, Cols> res = vec<T, Cols>(0);
    for (uint i = Rows; i > 0; --i)
    {
        vec<T, Cols> tmp(0);
        for (uint j = 0; j < Cols; ++j)
        {
            tmp[j] = m[j][i - 1];
        }
        res = fma(tmp, vec<T, Cols>(v[i - 1]), res);
    }
    return res;
}

template<typename T, int Cols, int Rows>
vec<T, Rows> spvFMulMatrixVector(matrix<T, Cols, Rows> m, vec<T, Cols> v)
{
    vec<T, Rows> res = vec<T, Rows>(0);
    for (uint i = Cols; i > 0; --i)
    {
        res = fma(m[i - 1], vec<T, Rows>(v[i - 1]), res);
    }
    return res;
}

template<typename T, int LCols, int LRows, int RCols, int RRows>
matrix<T, RCols, LRows> spvFMulMatrixMatrix(matrix<T, LCols, LRows> l, matrix<T, RCols, RRows> r)
{
    matrix<T, RCols, LRows> res;
    for (uint i = 0; i < RCols; i++)
    {
        vec<T, RCols> tmp(0);
        for (uint j = 0; j < LCols; j++)
        {
            tmp = fma(vec<T, RCols>(r[i][j]), l[j], tmp);
        }
        res[i] = tmp;
    }
    return res;
}

template<typename T>
T spvFAdd(T l, T r)
{
    return fma(T(1), l, r);
}

template<typename T>
T spvFSub(T l, T r)
{
    return fma(T(-1), r, l);
}

vertex main0_out main0(main0_in in [[stage_in]])
{
    main0_out out = {};
    float4 _15 = spvFMul(in.vA, in.vB);
    out.gl_Position = spvFAdd(spvFAdd(spvFAdd(_15, spvFAdd(in.vA, in.vB)), spvFSub(in.vA, in.vB)), spvFAdd(_15, in.vC));
    return out;
}

reference/opt/shaders/vert/no-contraction.vert (new file)
@@ -0,0 +1,18 @@
#version 450

layout(location = 0) in vec4 vA;
layout(location = 1) in vec4 vB;
layout(location = 2) in vec4 vC;

void main()
{
    precise vec4 _15 = vA * vB;
    precise vec4 _19 = vA + vB;
    precise vec4 _23 = vA - vB;
    precise vec4 _30 = _15 + vC;
    precise vec4 _34 = _15 + _19;
    precise vec4 _36 = _34 + _23;
    precise vec4 _38 = _36 + _30;
    gl_Position = _38;
}

reference/shaders-hlsl/vert/no-contraction.vert (new file)
@@ -0,0 +1,45 @@
static float4 gl_Position;
static float4 vA;
static float4 vB;
static float4 vC;

struct SPIRV_Cross_Input
{
    float4 vA : TEXCOORD0;
    float4 vB : TEXCOORD1;
    float4 vC : TEXCOORD2;
};

struct SPIRV_Cross_Output
{
    float4 gl_Position : SV_Position;
};

void vert_main()
{
    precise float4 _15 = vA * vB;
    float4 mul = _15;
    precise float4 _19 = vA + vB;
    float4 add = _19;
    precise float4 _23 = vA - vB;
    float4 sub = _23;
    precise float4 _27 = vA * vB;
    precise float4 _30 = _27 + vC;
    float4 mad = _30;
    precise float4 _34 = mul + add;
    precise float4 _36 = _34 + sub;
    precise float4 _38 = _36 + mad;
    float4 summed = _38;
    gl_Position = summed;
}

SPIRV_Cross_Output main(SPIRV_Cross_Input stage_input)
{
    vA = stage_input.vA;
    vB = stage_input.vB;
    vC = stage_input.vC;
    vert_main();
    SPIRV_Cross_Output stage_output;
    stage_output.gl_Position = gl_Position;
    return stage_output;
}

reference/shaders-msl/vert/no-contraction.vert (new file)
@@ -0,0 +1,92 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct main0_out
{
    float4 gl_Position [[position]];
};

struct main0_in
{
    float4 vA [[attribute(0)]];
    float4 vB [[attribute(1)]];
    float4 vC [[attribute(2)]];
};

template<typename T>
T spvFMul(T l, T r)
{
    return fma(l, r, T(0));
}

template<typename T, int Cols, int Rows>
vec<T, Cols> spvFMulVectorMatrix(vec<T, Rows> v, matrix<T, Cols, Rows> m)
{
    vec<T, Cols> res = vec<T, Cols>(0);
    for (uint i = Rows; i > 0; --i)
    {
        vec<T, Cols> tmp(0);
        for (uint j = 0; j < Cols; ++j)
        {
            tmp[j] = m[j][i - 1];
        }
        res = fma(tmp, vec<T, Cols>(v[i - 1]), res);
    }
    return res;
}

template<typename T, int Cols, int Rows>
vec<T, Rows> spvFMulMatrixVector(matrix<T, Cols, Rows> m, vec<T, Cols> v)
{
    vec<T, Rows> res = vec<T, Rows>(0);
    for (uint i = Cols; i > 0; --i)
    {
        res = fma(m[i - 1], vec<T, Rows>(v[i - 1]), res);
    }
    return res;
}

template<typename T, int LCols, int LRows, int RCols, int RRows>
matrix<T, RCols, LRows> spvFMulMatrixMatrix(matrix<T, LCols, LRows> l, matrix<T, RCols, RRows> r)
{
    matrix<T, RCols, LRows> res;
    for (uint i = 0; i < RCols; i++)
    {
        vec<T, RCols> tmp(0);
        for (uint j = 0; j < LCols; j++)
        {
            tmp = fma(vec<T, RCols>(r[i][j]), l[j], tmp);
        }
        res[i] = tmp;
    }
    return res;
}

template<typename T>
T spvFAdd(T l, T r)
{
    return fma(T(1), l, r);
}

template<typename T>
T spvFSub(T l, T r)
{
    return fma(T(-1), r, l);
}

vertex main0_out main0(main0_in in [[stage_in]])
{
    main0_out out = {};
    float4 mul = spvFMul(in.vA, in.vB);
    float4 add = spvFAdd(in.vA, in.vB);
    float4 sub = spvFSub(in.vA, in.vB);
    float4 mad = spvFAdd(spvFMul(in.vA, in.vB), in.vC);
    float4 summed = spvFAdd(spvFAdd(spvFAdd(mul, add), sub), mad);
    out.gl_Position = summed;
    return out;
}

reference/shaders/vert/no-contraction.vert (new file)
@@ -0,0 +1,24 @@
#version 450

layout(location = 0) in vec4 vA;
layout(location = 1) in vec4 vB;
layout(location = 2) in vec4 vC;

void main()
{
    precise vec4 _15 = vA * vB;
    vec4 mul = _15;
    precise vec4 _19 = vA + vB;
    vec4 add = _19;
    precise vec4 _23 = vA - vB;
    vec4 sub = _23;
    precise vec4 _27 = vA * vB;
    precise vec4 _30 = _27 + vC;
    vec4 mad = _30;
    precise vec4 _34 = mul + add;
    precise vec4 _36 = _34 + sub;
    precise vec4 _38 = _36 + mad;
    vec4 summed = _38;
    gl_Position = summed;
}

shaders-hlsl/vert/no-contraction.vert (new file)
@@ -0,0 +1,15 @@
#version 450

layout(location = 0) in vec4 vA;
layout(location = 1) in vec4 vB;
layout(location = 2) in vec4 vC;

void main()
{
    precise vec4 mul = vA * vB;
    precise vec4 add = vA + vB;
    precise vec4 sub = vA - vB;
    precise vec4 mad = vA * vB + vC;
    precise vec4 summed = mul + add + sub + mad;
    gl_Position = summed;
}

shaders-msl/vert/no-contraction.vert (new file)
@@ -0,0 +1,15 @@
#version 450

layout(location = 0) in vec4 vA;
layout(location = 1) in vec4 vB;
layout(location = 2) in vec4 vC;

void main()
{
    precise vec4 mul = vA * vB;
    precise vec4 add = vA + vB;
    precise vec4 sub = vA - vB;
    precise vec4 mad = vA * vB + vC;
    precise vec4 summed = mul + add + sub + mad;
    gl_Position = summed;
}

shaders/vert/no-contraction.vert (new file)
@@ -0,0 +1,15 @@
#version 450

layout(location = 0) in vec4 vA;
layout(location = 1) in vec4 vB;
layout(location = 2) in vec4 vC;

void main()
{
    precise vec4 mul = vA * vB;
    precise vec4 add = vA + vB;
    precise vec4 sub = vA - vB;
    precise vec4 mad = vA * vB + vC;
    precise vec4 summed = mul + add + sub + mad;
    gl_Position = summed;
}

@@ -559,18 +559,19 @@ string CompilerGLSL::compile()
 {
     ir.fixup_reserved_names();
 
-    if (options.vulkan_semantics)
-        backend.allow_precision_qualifiers = true;
-    else
+    if (!options.vulkan_semantics)
     {
         // only NV_gpu_shader5 supports divergent indexing on OpenGL, and it does so without extra qualifiers
         backend.nonuniform_qualifier = "";
         backend.needs_row_major_load_workaround = true;
     }
+    backend.allow_precision_qualifiers = options.vulkan_semantics || options.es;
     backend.force_gl_in_out_block = true;
     backend.supports_extensions = true;
     backend.use_array_constructor = true;
 
+    backend.support_precise_qualifier = (!options.es && options.version >= 400) || (options.es && options.version >= 320);
+
     if (is_legacy_es())
         backend.support_case_fallthrough = false;
 
@@ -5545,7 +5546,12 @@ void CompilerGLSL::emit_unary_op(uint32_t result_type, uint32_t result_id, uint3
 
 void CompilerGLSL::emit_binary_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op)
 {
-    bool forward = should_forward(op0) && should_forward(op1);
+    // Various FP arithmetic opcodes such as add, sub, mul will hit this.
+    bool force_temporary_precise = backend.support_precise_qualifier &&
+                                   has_decoration(result_id, DecorationNoContraction) &&
+                                   type_is_floating_point(get<SPIRType>(result_type));
+    bool forward = should_forward(op0) && should_forward(op1) && !force_temporary_precise;
 
     emit_op(result_type, result_id,
             join(to_enclosed_unpacked_expression(op0), " ", op, " ", to_enclosed_unpacked_expression(op1)), forward);
 
@@ -12728,7 +12734,7 @@ void CompilerGLSL::emit_struct_padding_target(const SPIRType &)
 {
 }
 
-const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags)
+string CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags)
 {
     // GL_EXT_buffer_reference variables can be marked as restrict.
     if (flags.get(DecorationRestrictPointerEXT))
@@ -12740,6 +12746,11 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B
         type.basetype != SPIRType::Sampler)
         return "";
 
+    string qual;
+
+    if (flags.get(DecorationNoContraction) && backend.support_precise_qualifier)
+        qual = "precise ";
+
     if (options.es)
     {
         auto &execution = get_entry_point();
@@ -12754,7 +12765,7 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B
                 options.fragment.default_int_precision == Options::Mediump &&
                 execution.model == ExecutionModelFragment;
 
-            return implied_fmediump || implied_imediump ? "" : "mediump ";
+            qual += (implied_fmediump || implied_imediump) ? "" : "mediump ";
         }
         else
         {
@@ -12768,7 +12779,7 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B
                 execution.model == ExecutionModelFragment) ||
                 (execution.model != ExecutionModelFragment));
 
-            return implied_fhighp || implied_ihighp ? "" : "highp ";
+            qual += (implied_fhighp || implied_ihighp) ? "" : "highp ";
         }
     }
     else if (backend.allow_precision_qualifiers)
@@ -12776,18 +12787,16 @@ const char *CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, const B
         // Vulkan GLSL supports precision qualifiers, even in desktop profiles, which is convenient.
         // The default is highp however, so only emit mediump in the rare case that a shader has these.
         if (flags.get(DecorationRelaxedPrecision))
-            return "mediump ";
-        else
-            return "";
+            qual += "mediump ";
     }
-    else
-        return "";
+
+    return qual;
 }
 
-const char *CompilerGLSL::to_precision_qualifiers_glsl(uint32_t id)
+string CompilerGLSL::to_precision_qualifiers_glsl(uint32_t id)
 {
     auto &type = expression_type(id);
-    bool use_precision_qualifiers = backend.allow_precision_qualifiers || options.es;
+    bool use_precision_qualifiers = backend.allow_precision_qualifiers;
     if (use_precision_qualifiers && (type.basetype == SPIRType::Image || type.basetype == SPIRType::SampledImage))
     {
         // Force mediump for the sampler type. We cannot declare 16-bit or smaller image types.
@@ -581,6 +581,7 @@ protected:
         bool use_array_constructor = false;
         bool needs_row_major_load_workaround = false;
         bool support_pointer_to_pointer = false;
+        bool support_precise_qualifier = false;
     } backend;
 
     void emit_struct(SPIRType &type);
@@ -734,9 +735,9 @@ protected:
     virtual std::string to_qualifiers_glsl(uint32_t id);
     void fixup_io_block_patch_qualifiers(const SPIRVariable &var);
     void emit_output_variable_initializer(const SPIRVariable &var);
-    const char *to_precision_qualifiers_glsl(uint32_t id);
+    std::string to_precision_qualifiers_glsl(uint32_t id);
     virtual const char *to_storage_qualifiers_glsl(const SPIRVariable &var);
-    const char *flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags);
+    std::string flags_to_qualifiers_glsl(const SPIRType &type, const Bitset &flags);
     const char *format_to_glsl(spv::ImageFormat format);
     virtual std::string layout_for_member(const SPIRType &type, uint32_t index);
     virtual std::string to_interpolation_qualifiers(const Bitset &flags);
@@ -5731,6 +5731,9 @@ string CompilerHLSL::compile()
     backend.nonuniform_qualifier = "NonUniformResourceIndex";
     backend.support_case_fallthrough = false;
 
+    // SM 4.1 does not support precise for some reason.
+    backend.support_precise_qualifier = hlsl_options.shader_model >= 50 || hlsl_options.shader_model == 40;
+
     fixup_type_alias();
     reorder_type_alias();
     build_function_control_flow_graphs_and_analyze();
@@ -4843,6 +4843,16 @@ void CompilerMSL::emit_custom_functions()
         statement("");
         break;
 
+    // "fsub" intrinsic support
+    case SPVFuncImplFSub:
+        statement("template<typename T>");
+        statement("T spvFSub(T l, T r)");
+        begin_scope();
+        statement("return fma(T(-1), r, l);");
+        end_scope();
+        statement("");
+        break;
+
     // "fmul' intrinsic support
     case SPVFuncImplFMul:
         statement("template<typename T>");
@@ -7579,19 +7589,26 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
         break;
 
     case OpFMul:
-        if (msl_options.invariant_float_math)
+        if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction))
             MSL_BFOP(spvFMul);
         else
             MSL_BOP(*);
         break;
 
     case OpFAdd:
-        if (msl_options.invariant_float_math)
+        if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction))
             MSL_BFOP(spvFAdd);
         else
             MSL_BOP(+);
         break;
 
+    case OpFSub:
+        if (msl_options.invariant_float_math || has_decoration(ops[1], DecorationNoContraction))
+            MSL_BFOP(spvFSub);
+        else
+            MSL_BOP(-);
+        break;
+
     // Atomics
     case OpAtomicExchange:
     {
|
||||
case OpVectorTimesMatrix:
|
||||
case OpMatrixTimesVector:
|
||||
{
|
||||
if (!msl_options.invariant_float_math)
|
||||
if (!msl_options.invariant_float_math && !has_decoration(ops[1], DecorationNoContraction))
|
||||
{
|
||||
CompilerGLSL::emit_instruction(instruction);
|
||||
break;
|
||||
@@ -8075,7 +8092,7 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 
     case OpMatrixTimesMatrix:
     {
-        if (!msl_options.invariant_float_math)
+        if (!msl_options.invariant_float_math && !has_decoration(ops[1], DecorationNoContraction))
         {
             CompilerGLSL::emit_instruction(instruction);
             break;
@@ -14856,9 +14873,11 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o
         return SPVFuncImplMod;
 
     case OpFAdd:
-        if (compiler.msl_options.invariant_float_math)
+    case OpFSub:
+        if (compiler.msl_options.invariant_float_math ||
+            compiler.has_decoration(args[1], DecorationNoContraction))
         {
-            return SPVFuncImplFAdd;
+            return opcode == OpFAdd ? SPVFuncImplFAdd : SPVFuncImplFSub;
         }
         break;
 
@@ -14867,7 +14886,8 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o
     case OpMatrixTimesVector:
     case OpVectorTimesMatrix:
     case OpMatrixTimesMatrix:
-        if (compiler.msl_options.invariant_float_math)
+        if (compiler.msl_options.invariant_float_math ||
+            compiler.has_decoration(args[1], DecorationNoContraction))
         {
             return SPVFuncImplFMul;
         }
@@ -655,6 +655,7 @@ protected:
         SPVFuncImplImage2DAtomicCoords, // Emulate texture2D atomic operations
         SPVFuncImplFMul,
         SPVFuncImplFAdd,
+        SPVFuncImplFSub,
         SPVFuncImplCubemapTo2DArrayFace,
         SPVFuncImplUnsafeArray, // Allow Metal to use the array<T> template to make arrays a value type
         SPVFuncImplInverse4x4,