0e963c62b6
WaveReadLaneAt is no longer restricted to a dynamically uniform index, so the other shuffle ops can be implemented with it.
103 lines
5.6 KiB
Plaintext
103 lines
5.6 KiB
Plaintext
// Workgroup dimensions under the GLSL built-in name; matches the
// [numthreads(1, 1, 1)] attribute on main().
static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u);

// Raw byte-addressed read/write buffer bound at u0, space0.
// comp_main() writes its observable results here.
RWByteAddressBuffer _9 : register(u0, space0);

// Per-lane subgroup masks under their GLSL built-in names.
// Each is a 128-bit mask split into four 32-bit words
// (x = lanes 0..31, y = 32..63, z = 64..95, w = 96..127).
// They are filled in by main() before comp_main() is called.
static uint4 gl_SubgroupEqMask;
static uint4 gl_SubgroupGeMask;
static uint4 gl_SubgroupGtMask;
static uint4 gl_SubgroupLeMask;
static uint4 gl_SubgroupLtMask;

// Shader body: invokes the wave (subgroup) and quad intrinsics one by one.
// Only the Store() calls write to memory, and every one of them targets byte
// offset 0 of _9, so each store overwrites the previous one. All other
// results are kept in locals that are never read again; they exist so that
// each intrinsic appears in the compiled shader.
void comp_main()
{
    // Basic wave queries.
    _9.Store(0, asuint(float(WaveGetLaneCount())));
    _9.Store(0, asuint(float(WaveGetLaneIndex())));
    bool elected = WaveIsFirstLane();

    // Subgroup masks prepared by main(); only the .x word is written out.
    _9.Store(0, asuint(float4(gl_SubgroupEqMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupGeMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupGtMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupLeMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupLtMask).x));

    // Broadcast from a fixed lane and from the first active lane.
    float4 broadcasted = WaveReadLaneAt(10.0f.xxxx, 8u);
    float3 first = WaveReadLaneFirst(20.0f.xxx);

    // Ballot, then a population count over all four 32-bit words.
    uint4 ballot_value = WaveActiveBallot(true);
    uint bit_count = countbits(ballot_value.x) + countbits(ballot_value.y) + countbits(ballot_value.z) + countbits(ballot_value.w);

    // Shuffle variants, all expressed through WaveReadLaneAt with a
    // per-lane source index.
    uint shuffled = WaveReadLaneAt(10u, 8u);
    uint shuffled_xor = WaveReadLaneAt(30u, WaveGetLaneIndex() ^ 8u);
    // NOTE(review): for lanes 0..3 the unsigned subtraction wraps to a very
    // large index, and for the top lanes "+ 4u" can exceed the lane count.
    // Confirm what WaveReadLaneAt yields for out-of-range lanes on the target.
    uint shuffled_up = WaveReadLaneAt(20u, WaveGetLaneIndex() - 4u);
    uint shuffled_down = WaveReadLaneAt(20u, WaveGetLaneIndex() + 4u);

    // Votes.
    bool has_all = WaveActiveAllTrue(true);
    bool has_any = WaveActiveAnyTrue(true);
    bool has_equal = WaveActiveAllEqual(true);

    // Arithmetic reductions across the active lanes.
    float4 added = WaveActiveSum(20.0f.xxxx);
    int4 iadded = WaveActiveSum(int4(20, 20, 20, 20));
    float4 multiplied = WaveActiveProduct(20.0f.xxxx);
    int4 imultiplied = WaveActiveProduct(int4(20, 20, 20, 20));
    float4 lo = WaveActiveMin(20.0f.xxxx);
    float4 hi = WaveActiveMax(20.0f.xxxx);
    int4 slo = WaveActiveMin(int4(20, 20, 20, 20));
    int4 shi = WaveActiveMax(int4(20, 20, 20, 20));
    uint4 ulo = WaveActiveMin(uint4(20u, 20u, 20u, 20u));
    uint4 uhi = WaveActiveMax(uint4(20u, 20u, 20u, 20u));

    // Bitwise reductions on the ballot words.
    uint4 anded = WaveActiveBitAnd(ballot_value);
    uint4 ored = WaveActiveBitOr(ballot_value);
    uint4 xored = WaveActiveBitXor(ballot_value);

    // Boolean reductions: each ballot word is compared with 42, the bool4 is
    // widened to uint4 for the bitwise wave op, and the result is converted
    // back to bool4.
    bool4 anded_b = bool4(WaveActiveBitAnd(uint4(bool4(ballot_value.x == uint4(42u, 42u, 42u, 42u).x, ballot_value.y == uint4(42u, 42u, 42u, 42u).y, ballot_value.z == uint4(42u, 42u, 42u, 42u).z, ballot_value.w == uint4(42u, 42u, 42u, 42u).w))));
    bool4 ored_b = bool4(WaveActiveBitOr(uint4(bool4(ballot_value.x == uint4(42u, 42u, 42u, 42u).x, ballot_value.y == uint4(42u, 42u, 42u, 42u).y, ballot_value.z == uint4(42u, 42u, 42u, 42u).z, ballot_value.w == uint4(42u, 42u, 42u, 42u).w))));
    bool4 xored_b = bool4(WaveActiveBitXor(uint4(bool4(ballot_value.x == uint4(42u, 42u, 42u, 42u).x, ballot_value.y == uint4(42u, 42u, 42u, 42u).y, ballot_value.z == uint4(42u, 42u, 42u, 42u).z, ballot_value.w == uint4(42u, 42u, 42u, 42u).w))));

    // Inclusive scans: the prefix result combined with this lane's own value.
    added = WavePrefixSum(added) + added;
    iadded = WavePrefixSum(iadded) + iadded;
    multiplied = WavePrefixProduct(multiplied) * multiplied;
    imultiplied = WavePrefixProduct(imultiplied) * imultiplied;

    // Plain prefix scans. Statement order matters here: each line reads the
    // value left behind by the lines above it.
    added = WavePrefixSum(multiplied);
    multiplied = WavePrefixProduct(multiplied);
    iadded = WavePrefixSum(imultiplied);
    imultiplied = WavePrefixProduct(imultiplied);

    // Quad operations.
    float4 swap_horiz = QuadReadAcrossX(20.0f.xxxx);
    float4 swap_vertical = QuadReadAcrossY(20.0f.xxxx);
    float4 swap_diagonal = QuadReadAcrossDiagonal(20.0f.xxxx);
    float4 quad_broadcast = QuadReadLaneAt(20.0f.xxxx, 3u);
}

// Compute entry point (one thread per group). Builds the five
// gl_Subgroup*Mask globals from WaveGetLaneIndex() and then runs comp_main().
//
// Each mask is 128 bits wide, stored as four 32-bit words covering lanes
// 0..31 (x), 32..63 (y), 64..95 (z) and 96..127 (w). The first statement of
// every group computes all four words at once with a vector shift by
// (index - word_base). That shift amount is only within 0..31 for the word
// that actually contains the index, so the conditionals that follow overwrite
// every other word with the constant it must hold (all zeros or all ones).
[numthreads(1, 1, 1)]
void main()
{
    // Eq: exactly the bit of the current lane; every other word is cleared.
    gl_SubgroupEqMask = 1u << (WaveGetLaneIndex() - uint4(0, 32, 64, 96));
    if (WaveGetLaneIndex() >= 32) gl_SubgroupEqMask.x = 0;
    if (WaveGetLaneIndex() >= 64 || WaveGetLaneIndex() < 32) gl_SubgroupEqMask.y = 0;
    if (WaveGetLaneIndex() >= 96 || WaveGetLaneIndex() < 64) gl_SubgroupEqMask.z = 0;
    if (WaveGetLaneIndex() < 96) gl_SubgroupEqMask.w = 0;

    // Ge: bits at or above the current lane. Words below the lane's word
    // become 0, words above it become all ones.
    // NOTE(review): the upper words are set to ~0u without consulting
    // WaveGetLaneCount(), so bits beyond the actual wave size stay set.
    // Confirm consumers tolerate that (the same applies to the Gt mask).
    gl_SubgroupGeMask = ~((1u << (WaveGetLaneIndex() - uint4(0, 32, 64, 96))) - 1u);
    if (WaveGetLaneIndex() >= 32) gl_SubgroupGeMask.x = 0u;
    if (WaveGetLaneIndex() >= 64) gl_SubgroupGeMask.y = 0u;
    if (WaveGetLaneIndex() >= 96) gl_SubgroupGeMask.z = 0u;
    if (WaveGetLaneIndex() < 32) gl_SubgroupGeMask.y = ~0u;
    if (WaveGetLaneIndex() < 64) gl_SubgroupGeMask.z = ~0u;
    if (WaveGetLaneIndex() < 96) gl_SubgroupGeMask.w = ~0u;

    // Gt: same construction as Ge, shifted by one lane (index + 1). The extra
    // ">= 128" case clears w when the index + 1 runs past the last word.
    uint gt_lane_index = WaveGetLaneIndex() + 1;
    gl_SubgroupGtMask = ~((1u << (gt_lane_index - uint4(0, 32, 64, 96))) - 1u);
    if (gt_lane_index >= 32) gl_SubgroupGtMask.x = 0u;
    if (gt_lane_index >= 64) gl_SubgroupGtMask.y = 0u;
    if (gt_lane_index >= 96) gl_SubgroupGtMask.z = 0u;
    if (gt_lane_index >= 128) gl_SubgroupGtMask.w = 0u;
    if (gt_lane_index < 32) gl_SubgroupGtMask.y = ~0u;
    if (gt_lane_index < 64) gl_SubgroupGtMask.z = ~0u;
    if (gt_lane_index < 96) gl_SubgroupGtMask.w = ~0u;

    // Le: bits at or below the current lane, i.e. the "less than" pattern for
    // index + 1. Words below the index's word become all ones, words above 0.
    uint le_lane_index = WaveGetLaneIndex() + 1;
    gl_SubgroupLeMask = (1u << (le_lane_index - uint4(0, 32, 64, 96))) - 1u;
    if (le_lane_index >= 32) gl_SubgroupLeMask.x = ~0u;
    if (le_lane_index >= 64) gl_SubgroupLeMask.y = ~0u;
    if (le_lane_index >= 96) gl_SubgroupLeMask.z = ~0u;
    if (le_lane_index >= 128) gl_SubgroupLeMask.w = ~0u;
    if (le_lane_index < 32) gl_SubgroupLeMask.y = 0u;
    if (le_lane_index < 64) gl_SubgroupLeMask.z = 0u;
    if (le_lane_index < 96) gl_SubgroupLeMask.w = 0u;

    // Lt: bits strictly below the current lane.
    gl_SubgroupLtMask = (1u << (WaveGetLaneIndex() - uint4(0, 32, 64, 96))) - 1u;
    if (WaveGetLaneIndex() >= 32) gl_SubgroupLtMask.x = ~0u;
    if (WaveGetLaneIndex() >= 64) gl_SubgroupLtMask.y = ~0u;
    if (WaveGetLaneIndex() >= 96) gl_SubgroupLtMask.z = ~0u;
    if (WaveGetLaneIndex() < 32) gl_SubgroupLtMask.y = 0u;
    if (WaveGetLaneIndex() < 64) gl_SubgroupLtMask.z = 0u;
    if (WaveGetLaneIndex() < 96) gl_SubgroupLtMask.w = 0u;

    // All masks are ready; run the translated shader body.
    comp_main();
}