Rewrite barrier handling in HLSL.

commit 9c3d4e7c60 (parent 27ad8c0922)
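Before the diff itself: a minimal standalone sketch of the barrier selection this commit introduces, as read from the spirv_hlsl.cpp hunk further down. The function name pick_hlsl_barrier and the literal semantics values are illustrative only; the real logic lives inline in CompilerHLSL::emit_instruction and uses the enums from spirv.hpp.

#include <cstdint>
#include <cstdio>

// Relevant SPIR-V memory-semantics bits (literal values from the SPIR-V spec).
static const uint32_t WorkgroupMemory = 0x100;
static const uint32_t UniformMemory   = 0x40;
static const uint32_t ImageMemory     = 0x800;

// Sketch: pick the HLSL intrinsic for a barrier, mirroring the new logic.
// with_group_sync corresponds to OpControlBarrier; semantics is assumed to be
// pre-masked so only memory-class bits remain.
static const char *pick_hlsl_barrier(bool with_group_sync, uint32_t semantics)
{
    if (with_group_sync)
    {
        // HLSL has no execution-only barrier; with no memory semantics,
        // fall back to the cheapest option, which also covers pure workgroup semantics.
        if (semantics == WorkgroupMemory || semantics == 0)
            return "GroupMemoryBarrierWithGroupSync();";
        if ((semantics & WorkgroupMemory) == 0)
            return "DeviceMemoryBarrierWithGroupSync();";
        return "AllMemoryBarrierWithGroupSync();";
    }
    else
    {
        if (semantics == WorkgroupMemory)
            return "GroupMemoryBarrier();";
        if (semantics != 0 && (semantics & WorkgroupMemory) == 0)
            return "DeviceMemoryBarrier();";
        return "AllMemoryBarrier();";
    }
}

int main()
{
    // Workgroup-only semantics (e.g. a shared-memory barrier).
    printf("%s\n", pick_hlsl_barrier(false, WorkgroupMemory));
    // Mixed semantics covering uniform, workgroup and image memory.
    printf("%s\n", pick_hlsl_barrier(false, UniformMemory | WorkgroupMemory | ImageMemory));
    // Control barrier with workgroup semantics.
    printf("%s\n", pick_hlsl_barrier(true, WorkgroupMemory));
}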
@@ -2,21 +2,21 @@ static const uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);

void comp_main()
{
    GroupMemoryBarrier();
    AllMemoryBarrier();
    DeviceMemoryBarrier();
    DeviceMemoryBarrier();
    AllMemoryBarrier();
    GroupMemoryBarrierWithGroupSync();
    AllMemoryBarrier();
    GroupMemoryBarrierWithGroupSync();
    DeviceMemoryBarrier();
    GroupMemoryBarrierWithGroupSync();
    DeviceMemoryBarrier();
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
    AllMemoryBarrier();
    GroupMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
}

[numthreads(4, 1, 1)]
@@ -16,7 +16,7 @@ groupshared float sShared[4];

void comp_main()
{
    sShared[gl_LocalInvocationIndex] = asfloat(_22.Load(gl_GlobalInvocationID.x * 4 + 0));
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
    _44.Store(gl_GlobalInvocationID.x * 4 + 0, asuint(sShared[(4u - gl_LocalInvocationIndex) - 1u]));
}
@@ -2,12 +2,12 @@ static const uint3 gl_WorkGroupSize = uint3(4u, 1u, 1u);

void barrier_shared()
{
    DeviceMemoryBarrier();
    GroupMemoryBarrier();
}

void full_barrier()
{
    DeviceMemoryBarrier();
    AllMemoryBarrier();
}

void image_barrier()

@@ -22,41 +22,41 @@ void buffer_barrier()

void group_barrier()
{
    DeviceMemoryBarrier();
    AllMemoryBarrier();
}

void barrier_shared_exec()
{
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
}

void full_barrier_exec()
{
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    AllMemoryBarrier();
    GroupMemoryBarrierWithGroupSync();
}

void image_barrier_exec()
{
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
}

void buffer_barrier_exec()
{
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
}

void group_barrier_exec()
{
    DeviceMemoryBarrier();
    DeviceMemoryBarrierWithGroupSync();
    AllMemoryBarrier();
    GroupMemoryBarrierWithGroupSync();
}

void exec_barrier()
{
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
}

void comp_main()

@@ -18,7 +18,7 @@ void comp_main()
    uint ident = gl_GlobalInvocationID.x;
    float idata = asfloat(_22.Load(ident * 4 + 0));
    sShared[gl_LocalInvocationIndex] = idata;
    DeviceMemoryBarrierWithGroupSync();
    GroupMemoryBarrierWithGroupSync();
    _44.Store(ident * 4 + 0, asuint(sShared[(4u - gl_LocalInvocationIndex) - 1u]));
}
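The reference files above interleave the old and the new output line by line (the original +/- column did not survive this view). As a rough summary of what changes at the GLSL-builtin level, here is a small sketch; the GLSL names on the left are my guess at what the barriers test shader calls, inferred from the function names above, and the right-hand side is what the backend emits after this commit.

#include <cstdio>

// Assumed mapping: GLSL builtin in the test shader -> HLSL emitted after this commit.
// Inferred from the reference output above and the emit_instruction hunk below.
struct BarrierMapping
{
    const char *glsl_builtin;
    const char *hlsl_after;
};

static const BarrierMapping mappings[] = {
    { "memoryBarrierShared()", "GroupMemoryBarrier();" },
    { "memoryBarrierBuffer()", "DeviceMemoryBarrier();" },
    { "memoryBarrierImage()", "DeviceMemoryBarrier();" },
    { "memoryBarrier()", "AllMemoryBarrier();" },
    { "groupMemoryBarrier()", "AllMemoryBarrier();" },
    { "barrier()", "GroupMemoryBarrierWithGroupSync();" },
};

int main()
{
    for (const auto &m : mappings)
        printf("%-24s -> %s\n", m.glsl_builtin, m.hlsl_after);
}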
@@ -25,16 +25,6 @@ using namespace spv;
using namespace spirv_cross;
using namespace std;

static uint32_t mask_relevant_memory_semantics(uint32_t semantics)
{
    return semantics & (MemorySemanticsAtomicCounterMemoryMask |
                        MemorySemanticsImageMemoryMask |
                        MemorySemanticsWorkgroupMemoryMask |
                        MemorySemanticsUniformMemoryMask |
                        MemorySemanticsCrossWorkgroupMemoryMask |
                        MemorySemanticsSubgroupMemoryMask);
}

static bool packing_is_vec4_padded(BufferPackingStandard packing)
{
    switch (packing)

@@ -8449,3 +8439,14 @@ const Instruction *CompilerGLSL::get_next_instruction_in_block(const Instruction &instr)
    else
        return nullptr;
}

uint32_t CompilerGLSL::mask_relevant_memory_semantics(uint32_t semantics)
{
    return semantics & (MemorySemanticsAtomicCounterMemoryMask |
                        MemorySemanticsImageMemoryMask |
                        MemorySemanticsWorkgroupMemoryMask |
                        MemorySemanticsUniformMemoryMask |
                        MemorySemanticsCrossWorkgroupMemoryMask |
                        MemorySemanticsSubgroupMemoryMask);
}
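The helper keeps only the memory-class bits and drops ordering bits such as Acquire/Release. A standalone usage sketch (SPIR-V constants written as literals here instead of the spv:: enums; the note about what a GLSL front end typically emits for memoryBarrierShared() is an assumption, not something shown in this diff):

#include <cassert>
#include <cstdint>

// SPIR-V memory-semantics bits (literal values from the SPIR-V spec).
static const uint32_t AcquireRelease       = 0x8;
static const uint32_t UniformMemory        = 0x40;
static const uint32_t SubgroupMemory       = 0x80;
static const uint32_t WorkgroupMemory      = 0x100;
static const uint32_t CrossWorkgroupMemory = 0x200;
static const uint32_t AtomicCounterMemory  = 0x400;
static const uint32_t ImageMemory          = 0x800;

// Same masking as CompilerGLSL::mask_relevant_memory_semantics above.
static uint32_t mask_relevant_memory_semantics(uint32_t semantics)
{
    return semantics & (AtomicCounterMemory | ImageMemory | WorkgroupMemory |
                        UniformMemory | CrossWorkgroupMemory | SubgroupMemory);
}

int main()
{
    // A shared-memory barrier typically arrives as AcquireRelease | WorkgroupMemory;
    // after masking, only the workgroup bit remains, which is what the HLSL
    // backend compares against to choose GroupMemoryBarrier().
    assert(mask_relevant_memory_semantics(AcquireRelease | WorkgroupMemory) == WorkgroupMemory);

    // Pure ordering semantics with no memory classes mask to zero.
    assert(mask_relevant_memory_semantics(AcquireRelease) == 0);
    return 0;
}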
@@ -510,6 +510,7 @@ protected:

    bool can_use_io_location(spv::StorageClass storage);
    const Instruction *get_next_instruction_in_block(const Instruction &instr);
    static uint32_t mask_relevant_memory_semantics(uint32_t semantics);

private:
    void init()
spirv_hlsl.cpp (113 changed lines)
@@ -3421,48 +3421,91 @@ void CompilerHLSL::emit_instruction(const Instruction &instruction)
break;
}

case OpControlBarrier:
case OpMemoryBarrier:
{
uint32_t mem = get<SPIRConstant>(ops[1]).scalar();
uint32_t memory;
uint32_t semantics;

// If the next instruction is OpControlBarrier and it does what we need, this opcode can be a noop.
const Instruction *next = get_next_instruction_in_block(instruction);
if (next && next->op == OpControlBarrier)
if (opcode == OpMemoryBarrier)
{
auto *next_ops = stream(*next);
uint32_t next_mem = get<SPIRConstant>(next_ops[2]).scalar();
next_mem |= MemorySemanticsWorkgroupMemoryMask; // Barrier in HLSL always implies GroupSync.
if ((next_mem & mem) == mem)
break;
memory = get<SPIRConstant>(ops[0]).scalar();
semantics = get<SPIRConstant>(ops[1]).scalar();
}

// We cannot forward any loads beyond the memory barrier.
if (mem)
flush_all_active_variables();

if (mem == MemorySemanticsWorkgroupMemoryMask)
statement("GroupMemoryBarrier();");
else if (mem)
statement("DeviceMemoryBarrier();");
break;
}

case OpControlBarrier:
{
uint32_t mem = get<SPIRConstant>(ops[2]).scalar();

// We cannot forward any loads beyond the memory barrier.
if (mem)
flush_all_active_variables();

if (mem == MemorySemanticsWorkgroupMemoryMask)
statement("GroupMemoryBarrierWithGroupSync();");
else if (mem)
statement("DeviceMemoryBarrierWithGroupSync();");
else
{
// There is no "GroupSync" standalone function.
statement("GroupMemoryBarrierWithGroupSync();");
memory = get<SPIRConstant>(ops[1]).scalar();
semantics = get<SPIRConstant>(ops[2]).scalar();
}

// We only care about these flags, acquire/release and friends are not relevant to GLSL.
semantics = mask_relevant_memory_semantics(semantics);

if (opcode == OpMemoryBarrier)
{
// If we are a memory barrier, and the next instruction is a control barrier, check if that memory barrier
// does what we need, so we avoid redundant barriers.
const Instruction *next = get_next_instruction_in_block(instruction);
if (next && next->op == OpControlBarrier)
{
auto *next_ops = stream(*next);
uint32_t next_memory = get<SPIRConstant>(next_ops[1]).scalar();
uint32_t next_semantics = get<SPIRConstant>(next_ops[2]).scalar();
next_semantics = mask_relevant_memory_semantics(next_semantics);

// There is no "just execution barrier" in HLSL.
// If there are no memory semantics for next instruction, we will imply group shared memory is synced.
if (next_semantics == 0)
next_semantics = MemorySemanticsWorkgroupMemoryMask;

bool memory_scope_covered = false;
if (next_memory == memory)
memory_scope_covered = true;
else if (next_semantics == MemorySemanticsWorkgroupMemoryMask)
{
// If we only care about workgroup memory, either Device or Workgroup scope is fine,
// scope does not have to match.
if ((next_memory == ScopeDevice || next_memory == ScopeWorkgroup) &&
(memory == ScopeDevice || memory == ScopeWorkgroup))
{
memory_scope_covered = true;
}
}
else if (memory == ScopeWorkgroup && next_memory == ScopeDevice)
{
// The control barrier has device scope, but the memory barrier just has workgroup scope.
memory_scope_covered = true;
}

// If we have the same memory scope, and all memory types are covered, we're good.
if (memory_scope_covered && (semantics & next_semantics) == semantics)
break;
}
}

// We are synchronizing some memory or syncing execution,
// so we cannot forward any loads beyond the memory barrier.
if (semantics || opcode == OpControlBarrier)
flush_all_active_variables();

if (opcode == OpControlBarrier)
{
// We cannot emit just execution barrier, for no memory semantics pick the cheapest option.
if (semantics == MemorySemanticsWorkgroupMemoryMask || semantics == 0)
statement("GroupMemoryBarrierWithGroupSync();");
else if (semantics != 0 && (semantics & MemorySemanticsWorkgroupMemoryMask) == 0)
statement("DeviceMemoryBarrierWithGroupSync();");
else
statement("AllMemoryBarrierWithGroupSync();");
}
else
{
if (semantics == MemorySemanticsWorkgroupMemoryMask)
statement("GroupMemoryBarrier();");
else if (semantics != 0 && (semantics & MemorySemanticsWorkgroupMemoryMask) == 0)
statement("DeviceMemoryBarrier();");
else
statement("AllMemoryBarrier();");
}
break;
}
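The key piece of the new OpMemoryBarrier path is the elision check: if the next instruction is an OpControlBarrier whose scope and masked semantics already cover the memory barrier, the memory barrier is skipped. A standalone sketch of that covering test, with scope and semantics values written as literals from the SPIR-V spec rather than taken from spirv.hpp:

#include <cassert>
#include <cstdint>

// SPIR-V scope and memory-semantics values (from the SPIR-V spec).
static const uint32_t ScopeDevice     = 1;
static const uint32_t ScopeWorkgroup  = 2;
static const uint32_t UniformMemory   = 0x40;
static const uint32_t WorkgroupMemory = 0x100;

// Sketch of the "can we skip this OpMemoryBarrier?" test from the hunk above.
// semantics / next_semantics are assumed to be pre-masked with
// mask_relevant_memory_semantics(); a control barrier with no semantics is
// treated as syncing workgroup memory, since HLSL has no pure execution barrier.
static bool memory_barrier_redundant(uint32_t memory, uint32_t semantics,
                                     uint32_t next_memory, uint32_t next_semantics)
{
    if (next_semantics == 0)
        next_semantics = WorkgroupMemory;

    bool memory_scope_covered = false;
    if (next_memory == memory)
        memory_scope_covered = true;
    else if (next_semantics == WorkgroupMemory)
    {
        // For workgroup-only semantics, Device vs Workgroup scope both work.
        if ((next_memory == ScopeDevice || next_memory == ScopeWorkgroup) &&
            (memory == ScopeDevice || memory == ScopeWorkgroup))
            memory_scope_covered = true;
    }
    else if (memory == ScopeWorkgroup && next_memory == ScopeDevice)
    {
        // The control barrier is wider in scope than the memory barrier.
        memory_scope_covered = true;
    }

    return memory_scope_covered && (semantics & next_semantics) == semantics;
}

int main()
{
    // Workgroup-scoped shared-memory barrier followed by a workgroup control barrier: covered, skip it.
    assert(memory_barrier_redundant(ScopeWorkgroup, WorkgroupMemory, ScopeWorkgroup, WorkgroupMemory));

    // A uniform/buffer memory barrier is not covered by a workgroup-memory control barrier: keep it.
    assert(!memory_barrier_redundant(ScopeWorkgroup, UniformMemory, ScopeWorkgroup, WorkgroupMemory));
    return 0;
}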