Add GroupSync() in HLSL.

Hans-Kristian Arntzen 2017-10-20 16:18:02 +02:00
parent 85eb972259
commit ae236e7056
6 changed files with 142 additions and 12 deletions

View File

@@ -0,0 +1,31 @@
const uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);

ByteAddressBuffer _22 : register(u0);
RWByteAddressBuffer _44 : register(u1);

static uint3 gl_GlobalInvocationID;
static uint gl_LocalInvocationIndex;

struct SPIRV_Cross_Input
{
    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
    uint gl_LocalInvocationIndex : SV_GroupIndex;
};

groupshared float sShared[4];

void comp_main()
{
    uint ident = gl_GlobalInvocationID.x;
    float idata = asfloat(_22.Load(ident * 4 + 0));
    sShared[gl_LocalInvocationIndex] = idata;
    GroupMemoryBarrierWithGroupSync();
    _44.Store(ident * 4 + 0, asuint(sShared[(4u - gl_LocalInvocationIndex) - 1u]));
}

[numthreads(4, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
    gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
    comp_main();
}

View File

@@ -0,0 +1,27 @@
#version 310 es
layout(local_size_x = 4) in;

shared float sShared[gl_WorkGroupSize.x];

layout(std430, binding = 0) readonly buffer SSBO
{
    float in_data[];
};

layout(std430, binding = 1) writeonly buffer SSBO2
{
    float out_data[];
};

void main()
{
    uint ident = gl_GlobalInvocationID.x;
    float idata = in_data[ident];

    sShared[gl_LocalInvocationIndex] = idata;
    memoryBarrierShared();
    barrier();

    out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
}
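
Taken together, the two listings above are the round trip this commit exercises: the GLSL compute shader is the test input, and the HLSL listing at the top is the expected cross-compiled output. Below is a minimal sketch of driving that conversion through the C++ API; the option and field names (set_options, shader_model) are assumptions about the API of this era rather than something the diff itself shows, and producing the SPIR-V words from the GLSL (e.g. via glslang) is out of scope here.

// Minimal sketch (not part of this commit): cross-compile a SPIR-V compute
// module to HLSL with the SPIRV-Cross C++ API.
#include "spirv_hlsl.hpp"

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

std::string to_hlsl(std::vector<uint32_t> spirv_words)
{
	spirv_cross::CompilerHLSL compiler(std::move(spirv_words));

	// Shader model 5.0 so compute shaders and the barrier intrinsics are available.
	// Option/field names are assumed from the API of this era.
	spirv_cross::CompilerHLSL::Options opts;
	opts.shader_model = 50;
	compiler.set_options(opts);

	return compiler.compile();
}

Run against the test shader above, this path should yield the single GroupMemoryBarrierWithGroupSync() call seen in the reference output at the top of the commit.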

View File

@@ -4888,6 +4888,14 @@ bool CompilerGLSL::optimize_read_modify_write(const string &lhs, const string &r
	return true;
}

void CompilerGLSL::emit_block_instructions(const SPIRBlock &block)
{
	current_emitting_block = &block;
	for (auto &op : block.ops)
		emit_instruction(op);
	current_emitting_block = nullptr;
}

void CompilerGLSL::emit_instruction(const Instruction &instruction)
{
	auto ops = stream(instruction);
@@ -6262,10 +6270,16 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
		if (get_entry_point().model == ExecutionModelGLCompute)
		{
			uint32_t mem = get<SPIRConstant>(ops[2]).scalar();

			// We cannot forward any loads beyond the memory barrier.
			if (mem)
				flush_all_active_variables();

			if (mem == MemorySemanticsWorkgroupMemoryMask)
				statement("memoryBarrierShared();");
			else if (mem)
				statement("memoryBarrier();");
		}
		statement("barrier();");
		break;
@@ -7338,8 +7352,7 @@ string CompilerGLSL::emit_continue_block(uint32_t continue_block)
	{
		propagate_loop_dominators(*block);
		// Write out all instructions we have in this block.
		for (auto &op : block->ops)
			emit_instruction(op);
		emit_block_instructions(*block);

		// For plain branchless for/while continue blocks.
		if (block->next_block)
@@ -7410,8 +7423,7 @@ bool CompilerGLSL::attempt_emit_loop_header(SPIRBlock &block, SPIRBlock::Method
		// If we're trying to create a true for loop,
		// we need to make sure that all opcodes before branch statement do not actually emit any code.
		// We can then take the condition expression and create a for (; cond ; ) { body; } structure instead.
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);

		bool condition_is_temporary = forced_temporaries.find(block.condition) == end(forced_temporaries);
@@ -7462,8 +7474,7 @@ bool CompilerGLSL::attempt_emit_loop_header(SPIRBlock &block, SPIRBlock::Method
		// If we're trying to create a true for loop,
		// we need to make sure that all opcodes before branch statement do not actually emit any code.
		// We can then take the condition expression and create a for (; cond ; ) { body; } structure instead.
		for (auto &op : child.ops)
			emit_instruction(op);
		emit_block_instructions(child);

		bool condition_is_temporary = forced_temporaries.find(child.condition) == end(forced_temporaries);
@@ -7569,8 +7580,8 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block)
	{
		statement("do");
		begin_scope();
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);
	}
	else if (block.merge == SPIRBlock::MergeLoop)
	{
@@ -7582,13 +7593,12 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block)
		statement("for (;;)");
		begin_scope();
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);
	}
	else
	{
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);
	}

	// If we didn't successfully emit a loop header and we had loop variable candidates, we have a problem

View File

@@ -181,7 +181,12 @@ protected:
	// Virtualize methods which need to be overridden by subclass targets like C++ and such.
	virtual void emit_function_prototype(SPIRFunction &func, uint64_t return_flags);

	// Kinda ugly way to let opcodes peek at their neighbor instructions for trivial peephole scenarios.
	const SPIRBlock *current_emitting_block = nullptr;
	virtual void emit_instruction(const Instruction &instr);
	void emit_block_instructions(const SPIRBlock &block);

	virtual void emit_glsl_op(uint32_t result_type, uint32_t result_id, uint32_t op, const uint32_t *args,
	                          uint32_t count);
	virtual void emit_header();

View File

@@ -2404,6 +2404,16 @@ void CompilerHLSL::emit_atomic(const uint32_t *ops, uint32_t length, spv::Op op)
	register_read(ops[1], ops[2], should_forward(ops[2]));
}

const Instruction *CompilerHLSL::get_next_instruction_in_block(const Instruction &instr)
{
	// FIXME: This is kind of hacky. There should be a cleaner way.
	uint32_t offset = uint32_t(&instr - current_emitting_block->ops.data());
	if ((offset + 1) < current_emitting_block->ops.size())
		return &current_emitting_block->ops[offset + 1];
	else
		return nullptr;
}

void CompilerHLSL::emit_instruction(const Instruction &instruction)
{
	auto ops = stream(instruction);
@@ -2783,6 +2793,52 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction)
		break;
	}

	case OpMemoryBarrier:
	{
		uint32_t mem = get<SPIRConstant>(ops[1]).scalar();

		// If the next instruction is OpControlBarrier and it does what we need, this opcode can be a noop.
		const Instruction *next = get_next_instruction_in_block(instruction);
		if (next && next->op == OpControlBarrier)
		{
			auto *next_ops = stream(*next);
			uint32_t next_mem = get<SPIRConstant>(next_ops[2]).scalar();
			next_mem |= MemorySemanticsWorkgroupMemoryMask; // Barrier in HLSL always implies GroupSync.
			if ((next_mem & mem) == mem)
				break;
		}

		// We cannot forward any loads beyond the memory barrier.
		if (mem)
			flush_all_active_variables();

		if (mem == MemorySemanticsWorkgroupMemoryMask)
			statement("GroupMemoryBarrier();");
		else if (mem)
			statement("DeviceMemoryBarrier();");
		break;
	}

	case OpControlBarrier:
	{
		uint32_t mem = get<SPIRConstant>(ops[2]).scalar();

		// We cannot forward any loads beyond the memory barrier.
		if (mem)
			flush_all_active_variables();

		if (mem == MemorySemanticsWorkgroupMemoryMask)
			statement("GroupMemoryBarrierWithGroupSync();");
		else if (mem)
			statement("DeviceMemoryBarrierWithGroupSync();");
		else
		{
			// There is no "GroupSync" standalone function.
			statement("GroupMemoryBarrierWithGroupSync();");
		}
		break;
	}

	default:
		CompilerGLSL::emit_instruction(instruction);
		break;
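
For reference, the intrinsic selection implemented in the two cases above boils down to the following decision. This is an illustrative standalone sketch of the same mapping, not code from the repository; the helper name and structure are invented for clarity.

#include <cstdint>
#include <string>

// SPIR-V MemorySemanticsWorkgroupMemoryMask (0x100), restated so the sketch
// is self-contained.
static const uint32_t kWorkgroupMemoryMask = 0x100u;

// Illustrative only: picks the HLSL barrier intrinsic for a memory-semantics
// mask, with or without the execution sync that OpControlBarrier implies.
std::string hlsl_barrier_for(uint32_t mem, bool with_group_sync)
{
	if (mem == kWorkgroupMemoryMask)
		return with_group_sync ? "GroupMemoryBarrierWithGroupSync();" : "GroupMemoryBarrier();";
	else if (mem != 0)
		return with_group_sync ? "DeviceMemoryBarrierWithGroupSync();" : "DeviceMemoryBarrier();";
	else
		// No memory semantics: HLSL has no standalone "GroupSync", so a full
		// group barrier is emitted for OpControlBarrier, and nothing otherwise.
		return with_group_sync ? "GroupMemoryBarrierWithGroupSync();" : "";
}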

View File

@@ -93,6 +93,7 @@ private:
	std::string read_access_chain(const SPIRAccessChain &chain);
	void emit_store(const Instruction &instruction);
	void emit_atomic(const uint32_t *ops, uint32_t length, spv::Op op);
	const Instruction *get_next_instruction_in_block(const Instruction &instr);
	void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
	                        const std::string &qualifier) override;