Add GroupSync() in HLSL.

Hans-Kristian Arntzen 2017-10-20 16:18:02 +02:00
parent 85eb972259
commit ae236e7056
6 changed files with 142 additions and 12 deletions

View File

@@ -0,0 +1,31 @@
const uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);

ByteAddressBuffer _22 : register(u0);
RWByteAddressBuffer _44 : register(u1);

static uint3 gl_GlobalInvocationID;
static uint gl_LocalInvocationIndex;

struct SPIRV_Cross_Input
{
    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
    uint gl_LocalInvocationIndex : SV_GroupIndex;
};

groupshared float sShared[4];

void comp_main()
{
    uint ident = gl_GlobalInvocationID.x;
    float idata = asfloat(_22.Load(ident * 4 + 0));
    sShared[gl_LocalInvocationIndex] = idata;
    GroupMemoryBarrierWithGroupSync();
    _44.Store(ident * 4 + 0, asuint(sShared[(4u - gl_LocalInvocationIndex) - 1u]));
}

[numthreads(4, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
    gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
    comp_main();
}

View File

@@ -0,0 +1,27 @@
#version 310 es
layout(local_size_x = 4) in;

shared float sShared[gl_WorkGroupSize.x];

layout(std430, binding = 0) readonly buffer SSBO
{
    float in_data[];
};

layout(std430, binding = 1) writeonly buffer SSBO2
{
    float out_data[];
};

void main()
{
    uint ident = gl_GlobalInvocationID.x;
    float idata = in_data[ident];

    sShared[gl_LocalInvocationIndex] = idata;
    memoryBarrierShared();
    barrier();

    out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
}
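
Taken together, the two listings above are the round trip this commit exercises: the GLSL compute shader is the test input, and the HLSL listing at the top is the expected cross-compiled output. Below is a minimal sketch of driving that conversion through the C++ API; the option and field names (set_options, shader_model) are assumptions about the API of this era rather than something the diff itself shows, and producing the SPIR-V words from the GLSL (e.g. via glslang) is out of scope here.

// Minimal sketch (not part of this commit): cross-compile a SPIR-V compute
// module to HLSL with the SPIRV-Cross C++ API.
#include "spirv_hlsl.hpp"

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

std::string to_hlsl(std::vector<uint32_t> spirv_words)
{
	spirv_cross::CompilerHLSL compiler(std::move(spirv_words));

	// Shader model 5.0 so compute shaders and the barrier intrinsics are available.
	// Option/field names are assumed from the API of this era.
	spirv_cross::CompilerHLSL::Options opts;
	opts.shader_model = 50;
	compiler.set_options(opts);

	return compiler.compile();
}

Run against the test shader above, this path should yield the single GroupMemoryBarrierWithGroupSync() call seen in the reference output at the top of the commit.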

View File

@@ -4888,6 +4888,14 @@ bool CompilerGLSL::optimize_read_modify_write(const string &lhs, const string &r
	return true;
}

void CompilerGLSL::emit_block_instructions(const SPIRBlock &block)
{
	current_emitting_block = &block;
	for (auto &op : block.ops)
		emit_instruction(op);
	current_emitting_block = nullptr;
}

void CompilerGLSL::emit_instruction(const Instruction &instruction)
{
	auto ops = stream(instruction);
@@ -6262,10 +6270,16 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
		if (get_entry_point().model == ExecutionModelGLCompute)
		{
			uint32_t mem = get<SPIRConstant>(ops[2]).scalar();

			// We cannot forward any loads beyond the memory barrier.
			if (mem)
				flush_all_active_variables();

			if (mem == MemorySemanticsWorkgroupMemoryMask)
				statement("memoryBarrierShared();");
			else if (mem)
				statement("memoryBarrier();");
		}
		statement("barrier();");
		break;
@@ -7338,8 +7352,7 @@ string CompilerGLSL::emit_continue_block(uint32_t continue_block)
	{
		propagate_loop_dominators(*block);
		// Write out all instructions we have in this block.
		for (auto &op : block->ops)
			emit_instruction(op);
		emit_block_instructions(*block);

		// For plain branchless for/while continue blocks.
		if (block->next_block)
@@ -7410,8 +7423,7 @@ bool CompilerGLSL::attempt_emit_loop_header(SPIRBlock &block, SPIRBlock::Method
		// If we're trying to create a true for loop,
		// we need to make sure that all opcodes before branch statement do not actually emit any code.
		// We can then take the condition expression and create a for (; cond ; ) { body; } structure instead.
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);

		bool condition_is_temporary = forced_temporaries.find(block.condition) == end(forced_temporaries);
@@ -7462,8 +7474,7 @@ bool CompilerGLSL::attempt_emit_loop_header(SPIRBlock &block, SPIRBlock::Method
		// If we're trying to create a true for loop,
		// we need to make sure that all opcodes before branch statement do not actually emit any code.
		// We can then take the condition expression and create a for (; cond ; ) { body; } structure instead.
		for (auto &op : child.ops)
			emit_instruction(op);
		emit_block_instructions(child);

		bool condition_is_temporary = forced_temporaries.find(child.condition) == end(forced_temporaries);
@@ -7569,8 +7580,8 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block)
	{
		statement("do");
		begin_scope();
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);
	}
	else if (block.merge == SPIRBlock::MergeLoop)
	{
@@ -7582,13 +7593,12 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block)
		statement("for (;;)");
		begin_scope();
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);
	}
	else
	{
		for (auto &op : block.ops)
			emit_instruction(op);
		emit_block_instructions(block);
	}

	// If we didn't successfully emit a loop header and we had loop variable candidates, we have a problem

View File

@@ -181,7 +181,12 @@ protected:
	// Virtualize methods which need to be overridden by subclass targets like C++ and such.
	virtual void emit_function_prototype(SPIRFunction &func, uint64_t return_flags);

	// Kinda ugly way to let opcodes peek at their neighbor instructions for trivial peephole scenarios.
	const SPIRBlock *current_emitting_block = nullptr;
	virtual void emit_instruction(const Instruction &instr);
	void emit_block_instructions(const SPIRBlock &block);

	virtual void emit_glsl_op(uint32_t result_type, uint32_t result_id, uint32_t op, const uint32_t *args,
	                          uint32_t count);
	virtual void emit_header();

View File

@@ -2404,6 +2404,16 @@ void CompilerHLSL::emit_atomic(const uint32_t *ops, uint32_t length, spv::Op op)
	register_read(ops[1], ops[2], should_forward(ops[2]));
}

const Instruction *CompilerHLSL::get_next_instruction_in_block(const Instruction &instr)
{
	// FIXME: This is kind of hacky. There should be a cleaner way.
	uint32_t offset = uint32_t(&instr - current_emitting_block->ops.data());
	if ((offset + 1) < current_emitting_block->ops.size())
		return &current_emitting_block->ops[offset + 1];
	else
		return nullptr;
}

void CompilerHLSL::emit_instruction(const Instruction &instruction)
{
	auto ops = stream(instruction);
@@ -2783,6 +2793,52 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction)
		break;
	}

	case OpMemoryBarrier:
	{
		uint32_t mem = get<SPIRConstant>(ops[1]).scalar();

		// If the next instruction is OpControlBarrier and it does what we need, this opcode can be a noop.
		const Instruction *next = get_next_instruction_in_block(instruction);
		if (next && next->op == OpControlBarrier)
		{
			auto *next_ops = stream(*next);
			uint32_t next_mem = get<SPIRConstant>(next_ops[2]).scalar();
			next_mem |= MemorySemanticsWorkgroupMemoryMask; // Barrier in HLSL always implies GroupSync.
			if ((next_mem & mem) == mem)
				break;
		}

		// We cannot forward any loads beyond the memory barrier.
		if (mem)
			flush_all_active_variables();

		if (mem == MemorySemanticsWorkgroupMemoryMask)
			statement("GroupMemoryBarrier();");
		else if (mem)
			statement("DeviceMemoryBarrier();");
		break;
	}

	case OpControlBarrier:
	{
		uint32_t mem = get<SPIRConstant>(ops[2]).scalar();

		// We cannot forward any loads beyond the memory barrier.
		if (mem)
			flush_all_active_variables();

		if (mem == MemorySemanticsWorkgroupMemoryMask)
			statement("GroupMemoryBarrierWithGroupSync();");
		else if (mem)
			statement("DeviceMemoryBarrierWithGroupSync();");
		else
		{
			// There is no "GroupSync" standalone function.
			statement("GroupMemoryBarrierWithGroupSync();");
		}
		break;
	}

	default:
		CompilerGLSL::emit_instruction(instruction);
		break;
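
For reference, the intrinsic selection implemented in the two cases above boils down to the following decision. This is an illustrative standalone sketch of the same mapping, not code from the repository; the helper name and structure are invented for clarity.

#include <cstdint>
#include <string>

// SPIR-V MemorySemanticsWorkgroupMemoryMask (0x100), restated so the sketch
// is self-contained.
static const uint32_t kWorkgroupMemoryMask = 0x100u;

// Illustrative only: picks the HLSL barrier intrinsic for a memory-semantics
// mask, with or without the execution sync that OpControlBarrier implies.
std::string hlsl_barrier_for(uint32_t mem, bool with_group_sync)
{
	if (mem == kWorkgroupMemoryMask)
		return with_group_sync ? "GroupMemoryBarrierWithGroupSync();" : "GroupMemoryBarrier();";
	else if (mem != 0)
		return with_group_sync ? "DeviceMemoryBarrierWithGroupSync();" : "DeviceMemoryBarrier();";
	else
		// No memory semantics: HLSL has no standalone "GroupSync", so a full
		// group barrier is emitted for OpControlBarrier, and nothing otherwise.
		return with_group_sync ? "GroupMemoryBarrierWithGroupSync();" : "";
}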

View File

@@ -93,6 +93,7 @@ private:
	std::string read_access_chain(const SPIRAccessChain &chain);
	void emit_store(const Instruction &instruction);
	void emit_atomic(const uint32_t *ops, uint32_t length, spv::Op op);
	const Instruction *get_next_instruction_in_block(const Instruction &instr);
	void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
	                        const std::string &qualifier) override;