Fix PLBVH reduction kernels, simplify nodes per level calculation.

Also calculate index ranges for each internal node.
This commit is contained in:
Jackson Lee 2014-02-19 21:49:30 -08:00
parent 7f0e361fa0
commit e955192971
4 changed files with 193 additions and 175 deletions

View File

@ -44,10 +44,8 @@ subject to the following restrictions:
///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree. ///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree.
/// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling. /// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling.
/// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the /// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the
/// Z-curve anyways, and this method should be simpler and faster during construction. Additionally, it is still possible /// Z-curve anyways, and this method should be simpler and faster during construction.
/// to improve the quality of the BVH by rearranging the connections between nodes. /// - Rather than traveling upwards towards the root from the leaf nodes, as in the paper,
/// - Due to the way the tree is constructed, it becomes unnecessary to use atomic_inc to get
/// the AABB for each internal node. Rather than traveling upwards from the leaf nodes, as in the paper,
/// each internal node checks its child nodes to get its AABB. /// each internal node checks its child nodes to get its AABB.
class b3GpuParallelLinearBvh class b3GpuParallelLinearBvh
{ {
@ -73,6 +71,7 @@ class b3GpuParallelLinearBvh
//1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node //1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs; b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child
b3OpenCLArray<int> m_internalNodeParentNodes; b3OpenCLArray<int> m_internalNodeParentNodes;
@ -90,6 +89,7 @@ public:
m_numNodesPerLevelGpu(context, queue), m_numNodesPerLevelGpu(context, queue),
m_firstIndexOffsetPerLevelGpu(context, queue), m_firstIndexOffsetPerLevelGpu(context, queue),
m_internalNodeAabbs(context, queue), m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue), m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue), m_internalNodeParentNodes(context, queue),
m_leafNodeParentNodes(context, queue), m_leafNodeParentNodes(context, queue),
@ -128,8 +128,6 @@ public:
clReleaseProgram(m_parallelLinearBvhProgram); clReleaseProgram(m_parallelLinearBvhProgram);
} }
// fix: need to handle/test case with 2 nodes
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs) void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs)
{ {
@ -143,6 +141,7 @@ public:
// //
{ {
m_internalNodeAabbs.resize(numInternalNodes); m_internalNodeAabbs.resize(numInternalNodes);
m_internalNodeLeafIndexRanges.resize(numInternalNodes);
m_internalNodeChildNodes.resize(numInternalNodes); m_internalNodeChildNodes.resize(numInternalNodes);
m_internalNodeParentNodes.resize(numInternalNodes); m_internalNodeParentNodes.resize(numInternalNodes);
@ -180,17 +179,20 @@ public:
//Calculate number of nodes in each level; //Calculate number of nodes in each level;
//start from the second to last level(level right next to leaf nodes) and move towards the root(level 0) //start from the second to last level(level right next to leaf nodes) and move towards the root(level 0)
int hasRemainder = 0; int remainder = 0;
for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex) for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex)
{ {
int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves
int numNodesCurrentLevel = numNodesPreviousLevel / 2;
bool allNodesAllocated = ( (numNodesPreviousLevel + hasRemainder) % 2 == 0 );
int numNodesCurrentLevel = (allNodesAllocated) ? (numNodesPreviousLevel + hasRemainder) / 2 : numNodesPreviousLevel / 2;
m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel;
hasRemainder = static_cast<int>(!allNodesAllocated); remainder += numNodesPreviousLevel % 2;
if(remainder == 2)
{
numNodesCurrentLevel++;
remainder = 0;
}
m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel;
} }
//Prefix sum to calculate the first index offset of each level //Prefix sum to calculate the first index offset of each level
@ -232,17 +234,22 @@ public:
B3_PROFILE("Find AABB of merged nodes"); B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs); //Need to make a copy since the kernel modifies the array m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs); //Need to make a copy since the kernel modifies the array
b3BufferInfoCL bufferInfo[] = for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{ {
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] b3BufferInfoCL bufferInfo[] =
}; {
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue); clFinish(m_queue);
} }
@ -315,7 +322,8 @@ public:
m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false); m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false);
clFinish(m_queue); clFinish(m_queue);
for(int i = 0; i < 256; ++i) printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y); for(int i = 0; i < numInternalNodes; ++i)
printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y);
printf("\n"); printf("\n");
} }
} }
@ -325,30 +333,58 @@ public:
{ {
B3_PROFILE("Set AABBs"); B3_PROFILE("Set AABBs");
b3BufferInfoCL bufferInfo[] = //Due to the arrangement of internal nodes, each internal node corresponds
//to a contiguous range of leaf node indices. This characteristic can be used
//to optimize calculateOverlappingPairs(); checking if
//(m_internalNodeLeafIndexRanges[].y < leafNodeIndex) can be used to ensure that
//each pair is processed only once.
{ {
b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), B3_PROFILE("Reset internal node index ranges");
b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), b3Int2 invalidIndexRange;
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), invalidIndexRange.x = -1; //x == min
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), invalidIndexRange.y = -2; //y == max
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
}; m_fill.execute( m_internalNodeLeafIndexRanges, invalidIndexRange, m_internalNodeLeafIndexRanges.size() );
clFinish(m_queue);
}
b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); int lastInternalLevelIndex = numLevels - 2; //Last level is leaf node level
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); for(int level = lastInternalLevelIndex; level >= 0; --level)
launcher.setConst(numLevels); {
launcher.setConst(numInternalNodes); b3BufferInfoCL bufferInfo[] =
{
launcher.launch1D(numLeaves); b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLevels);
launcher.setConst(numInternalNodes);
launcher.setConst(level);
launcher.launch1D(numLeaves);
}
clFinish(m_queue); clFinish(m_queue);
if(0) if(0)
{ {
b3SapAabb mergedAABB = m_mergedAabb.at(0); static b3AlignedObjectArray<b3Int2> leafIndexRanges;
printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z); m_internalNodeLeafIndexRanges.copyToHost(leafIndexRanges, false);
printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z); clFinish(m_queue);
for(int i = 0; i < numInternalNodes; ++i)
//if(leafIndexRanges[i].x == -1 || leafIndexRanges[i].y == -1)
printf("leafIndexRanges[%d]: %d, %d\n", i, leafIndexRanges[i].x, leafIndexRanges[i].y);
printf("\n");
} }
if(0) if(0)
{ {
static b3AlignedObjectArray<b3SapAabb> rigidAabbs; static b3AlignedObjectArray<b3SapAabb> rigidAabbs;
@ -363,12 +399,18 @@ public:
actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec); actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec);
actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec); actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec);
} }
printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z);
printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z);
b3SapAabb rootAabb = m_internalNodeAabbs.at(0); b3SapAabb rootAabb = m_internalNodeAabbs.at(0);
printf("rootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z); b3SapAabb mergedAABB = m_mergedAabb.at(0);
printf("rootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z);
printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z);
printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z);
printf("kernelRootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z);
printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z);
printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z);
printf("kernelRootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z);
printf("\n"); printf("\n");
} }
} }
@ -397,6 +439,7 @@ public:
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( out_numPairs.getBufferCL() ), b3BufferInfoCL( out_numPairs.getBufferCL() ),

View File

@ -77,11 +77,7 @@ public:
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; } virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
{
b3Assert(0); //CPU version not implemented
return m_aabbsCpu;
}
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{ {

View File

@ -80,60 +80,32 @@ unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
} }
//Should replace with an optimized parallel reduction
__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs) __kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
{ {
int aabbIndex = get_global_id(0); //Each time this kernel is added to the command queue,
if(aabbIndex >= numAabbs) return; //the number of AABBs needing to be merged is halved
//Find the most significant bit(msb)
int mostSignificantBit = 0;
{
int temp = numAabbs;
while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
}
int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );
int numRemainingAabbs = (1 << mostSignificantBit);
//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2
//For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]
if(aabbIndex < numberOfAabbsAboveMsbSplit)
{
int otherAabbIndex = numRemainingAabbs + aabbIndex;
b3AabbCL aabb = out_mergedAabb[aabbIndex];
b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
out_mergedAabb[aabbIndex] = mergedAabb;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// //
int offset = numRemainingAabbs / 2; //Example with 159 AABBs:
while(offset >= 1) // numRemainingAabbs == 159 / 2 + 159 % 2 == 80
{ // numMergedAabbs == 159 - 80 == 79
if(aabbIndex < offset) //So, indices [0, 78] are merged with [0 + 80, 78 + 80]
{
int otherAabbIndex = aabbIndex + offset; int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;
int aabbIndex = get_global_id(0);
if(aabbIndex >= numMergedAabbs) return;
int otherAabbIndex = aabbIndex + numRemainingAabbs;
b3AabbCL aabb = out_mergedAabb[aabbIndex];
b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
b3AabbCL aabb = out_mergedAabb[aabbIndex]; b3AabbCL mergedAabb;
b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
b3AabbCL mergedAabb; out_mergedAabb[aabbIndex] = mergedAabb;
mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
out_mergedAabb[aabbIndex] = mergedAabb;
}
offset /= 2;
barrier(CLK_GLOBAL_MEM_FENCE);
}
} }
__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, __kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes,
@ -254,7 +226,7 @@ __kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,
{ {
int leafNodeLevel = numLevels - 1; int leafNodeLevel = numLevels - 1;
leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex; leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;
rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex; rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;
} }
//Set the negative sign bit if the node is internal //Set the negative sign bit if the node is internal
@ -276,20 +248,19 @@ __kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,
__global int2* internalNodeChildIndices, __global int2* internalNodeChildIndices,
__global SortDataCL* mortonCodesAndAabbIndices, __global SortDataCL* mortonCodesAndAabbIndices,
__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* leafNodeAabbs,
__global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes) __global int2* out_internalNodeLeafIndexRanges,
__global b3AabbCL* out_internalNodeAabbs,
int numLevels, int numInternalNodes, int level)
{ {
int i = get_global_id(0); int i = get_global_id(0);
if(i >= numInternalNodes) return; if(i >= numInternalNodes) return;
int numInternalLevels = numLevels - 1; //For each node in a level, check its child nodes to determine its AABB
//Starting from the level next to the leaf nodes, move towards the root(level 0)
for(int level = numInternalLevels - 1; level >= 0; --level)
{ {
int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level] int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]
int numNodesInLevel = numNodesPerLevel[level]; int numNodesInLevel = numNodesPerLevel[level];
if(i < numNodesInLevel) if(indexInLevel < numNodesInLevel)
{ {
int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level]; int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];
int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal]; int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];
@ -300,19 +271,26 @@ __kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,
int isLeftChildLeaf = isLeafNode(childIndicies.x); int isLeftChildLeaf = isLeafNode(childIndicies.x);
int isRightChildLeaf = isLeafNode(childIndicies.y); int isRightChildLeaf = isLeafNode(childIndicies.y);
//left/RightChildLeafIndex == Rigid body indicies
int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex]; b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];
b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex]; b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];
//
b3AabbCL internalNodeAabb; b3AabbCL internalNodeAabb;
internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb; out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;
//For index range, x == min and y == max; left child always has lower index
int2 leafIndexRange;
leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x;
leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y;
out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange;
} }
barrier(CLK_GLOBAL_MEM_FENCE);
} }
} }
@ -331,7 +309,9 @@ bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
//From sap.cl //From sap.cl
__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
__global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs, __global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices, __global SortDataCL* mortonCodesAndAabbIndices,
__global int* out_numPairs, __global int4* out_overlappingPairs, __global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numQueryAabbs) int maxPairs, int numQueryAabbs)
@ -341,7 +321,8 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(queryRigidIndex >= numQueryAabbs) return; if(queryRigidIndex >= numQueryAabbs) return;
queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; int queryBvhNodeIndex = queryRigidIndex;
queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; // fix queryRigidIndex naming for this branch
#else #else
int queryRigidIndex = get_global_id(0); int queryRigidIndex = get_global_id(0);
if(queryRigidIndex >= numQueryAabbs) return; if(queryRigidIndex >= numQueryAabbs) return;
@ -363,7 +344,15 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//Optimization - if the node is not a leaf, check whether the highest leaf index of that node
//is less than the queried node's index to avoid testing each pair twice.
{
// fix: produces duplicate pairs
// int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;
// if(highestLeafIndex < queryBvhNodeIndex) continue;
}
//bvhRigidIndex is not used if internal node //bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;

View File

@ -75,59 +75,32 @@ static const char* parallelLinearBvhCL= \
"{\n" "{\n"
" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" " return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
"}\n" "}\n"
"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)\n" "//Should replace with an optimized parallel reduction\n"
"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
"{\n" "{\n"
" int aabbIndex = get_global_id(0);\n" " //Each time this kernel is added to the command queue, \n"
" if(aabbIndex >= numAabbs) return;\n" " //the number of AABBs needing to be merged is halved\n"
" \n"
" //Find the most significant bit(msb)\n"
" int mostSignificantBit = 0;\n"
" {\n"
" int temp = numAabbs;\n"
" while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)\n"
" }\n"
" \n"
" int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );\n"
" int numRemainingAabbs = (1 << mostSignificantBit);\n"
" \n"
" //Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2\n"
" //For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]\n"
" if(aabbIndex < numberOfAabbsAboveMsbSplit)\n"
" {\n"
" int otherAabbIndex = numRemainingAabbs + aabbIndex;\n"
" \n"
" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
" \n"
" b3AabbCL mergedAabb;\n"
" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
" out_mergedAabb[aabbIndex] = mergedAabb;\n"
" }\n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" \n"
" //\n" " //\n"
" int offset = numRemainingAabbs / 2;\n" " //Example with 159 AABBs:\n"
" while(offset >= 1)\n" " // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
" {\n" " // numMergedAabbs == 159 - 80 == 79\n"
" if(aabbIndex < offset)\n" " //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
" {\n" " \n"
" int otherAabbIndex = aabbIndex + offset;\n" " int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
" \n"
" int aabbIndex = get_global_id(0);\n"
" if(aabbIndex >= numMergedAabbs) return;\n"
" \n"
" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
" \n"
" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
" \n" " \n"
" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" " b3AabbCL mergedAabb;\n"
" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" " mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
" \n" " mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
" b3AabbCL mergedAabb;\n" " out_mergedAabb[aabbIndex] = mergedAabb;\n"
" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
" out_mergedAabb[aabbIndex] = mergedAabb;\n"
" }\n"
" \n"
" offset /= 2;\n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
"}\n" "}\n"
"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" "__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" " __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
@ -244,7 +217,7 @@ static const char* parallelLinearBvhCL= \
" {\n" " {\n"
" int leafNodeLevel = numLevels - 1;\n" " int leafNodeLevel = numLevels - 1;\n"
" leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n" " leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n"
" rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n" " rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n"
" }\n" " }\n"
" \n" " \n"
" //Set the negative sign bit if the node is internal\n" " //Set the negative sign bit if the node is internal\n"
@ -265,20 +238,19 @@ static const char* parallelLinearBvhCL= \
" __global int2* internalNodeChildIndices,\n" " __global int2* internalNodeChildIndices,\n"
" __global SortDataCL* mortonCodesAndAabbIndices,\n" " __global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global b3AabbCL* leafNodeAabbs, \n" " __global b3AabbCL* leafNodeAabbs, \n"
" __global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)\n" " __global int2* out_internalNodeLeafIndexRanges,\n"
" __global b3AabbCL* out_internalNodeAabbs, \n"
" int numLevels, int numInternalNodes, int level)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" if(i >= numInternalNodes) return;\n" " if(i >= numInternalNodes) return;\n"
" \n" " \n"
" int numInternalLevels = numLevels - 1;\n" " //For each node in a level, check its child nodes to determine its AABB\n"
" \n"
" //Starting from the level next to the leaf nodes, move towards the root(level 0)\n"
" for(int level = numInternalLevels - 1; level >= 0; --level)\n"
" {\n" " {\n"
" int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]\n" " int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]\n"
" \n" " \n"
" int numNodesInLevel = numNodesPerLevel[level];\n" " int numNodesInLevel = numNodesPerLevel[level];\n"
" if(i < numNodesInLevel)\n" " if(indexInLevel < numNodesInLevel)\n"
" {\n" " {\n"
" int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n" " int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n"
" int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n" " int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n"
@ -289,19 +261,26 @@ static const char* parallelLinearBvhCL= \
" int isLeftChildLeaf = isLeafNode(childIndicies.x);\n" " int isLeftChildLeaf = isLeafNode(childIndicies.x);\n"
" int isRightChildLeaf = isLeafNode(childIndicies.y);\n" " int isRightChildLeaf = isLeafNode(childIndicies.y);\n"
" \n" " \n"
" //left/RightChildLeafIndex == Rigid body indicies\n"
" int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" " int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
" int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" " int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
" \n" " \n"
" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n" " b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n"
" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n" " b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n"
" \n" " \n"
" //\n"
" b3AabbCL internalNodeAabb;\n" " b3AabbCL internalNodeAabb;\n"
" internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" " internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
" internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" " internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
" out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n" " out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n"
" \n"
" //For index range, x == min and y == max; left child always has lower index\n"
" int2 leafIndexRange;\n"
" leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x;\n"
" leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y;\n"
" \n"
" out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange;\n"
" }\n" " }\n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n" " }\n"
"}\n" "}\n"
"//From sap.cl\n" "//From sap.cl\n"
@ -316,7 +295,9 @@ static const char* parallelLinearBvhCL= \
"}\n" "}\n"
"//From sap.cl\n" "//From sap.cl\n"
"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" "__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
" __global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,\n" " __global int2* internalNodeChildIndices, \n"
" __global b3AabbCL* internalNodeAabbs,\n"
" __global int2* internalNodeLeafIndexRanges,\n"
" __global SortDataCL* mortonCodesAndAabbIndices,\n" " __global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" " __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
" int maxPairs, int numQueryAabbs)\n" " int maxPairs, int numQueryAabbs)\n"
@ -326,7 +307,8 @@ static const char* parallelLinearBvhCL= \
" int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" " int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
" if(queryRigidIndex >= numQueryAabbs) return;\n" " if(queryRigidIndex >= numQueryAabbs) return;\n"
" \n" " \n"
" queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n" " int queryBvhNodeIndex = queryRigidIndex;\n"
" queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; // fix queryRigidIndex naming for this branch\n"
"#else\n" "#else\n"
" int queryRigidIndex = get_global_id(0);\n" " int queryRigidIndex = get_global_id(0);\n"
" if(queryRigidIndex >= numQueryAabbs) return;\n" " if(queryRigidIndex >= numQueryAabbs) return;\n"
@ -347,7 +329,15 @@ static const char* parallelLinearBvhCL= \
" \n" " \n"
" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
" \n" " \n"
" //Optimization - if the node is not a leaf, check whether the highest leaf index of that node\n"
" //is less than the queried node's index to avoid testing each pair twice.\n"
" {\n"
" // fix: produces duplicate pairs\n"
" // int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
" // if(highestLeafIndex < queryBvhNodeIndex) continue;\n"
" }\n"
" \n"
" //bvhRigidIndex is not used if internal node\n" " //bvhRigidIndex is not used if internal node\n"
" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
" \n" " \n"