Draft Parallel Linear BVH Broadphase.

This commit is contained in:
Jackson Lee 2014-02-18 19:23:25 -08:00
parent fabdf8b4a9
commit b7b7356af8
7 changed files with 1242 additions and 0 deletions

View File

@ -9,6 +9,7 @@
#include "OpenGLWindow/b3gWindowInterface.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
@ -108,6 +109,7 @@ static int curSelectedBroadphase = 0;
static BroadphaseEntry allBroadphases[]=
{
{"Gpu Grid",b3GpuGridBroadphase::CreateFunc},
{"Parallel Linear BVH",b3GpuParallelLinearBvhBroadphase::CreateFunc},
{"CPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceCpu},
{"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu},
{"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal},

View File

@ -13,6 +13,7 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/Broadphas
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify

View File

@ -14,6 +14,7 @@ rem @echo off
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify

View File

@ -0,0 +1,416 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H
#define B3_GPU_PARALLEL_LINEAR_BVH_H
//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
#define B3_PLBVH_ROOT_NODE_MARKER -1 //Syncronize with parallelLinearBvh.cl
///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame
///@remarks
///Main references: \n
///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build radix binary tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree.
/// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling.
/// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the
/// Z-curve anyways, and this method should be simpler and faster during construction. Additionally, it is still possible
/// to improve the quality of the BVH by rearranging the connections between nodes.
/// - Due to the way the tree is constructed, it becomes unnecessary to use atomic_inc to get
/// the AABB for each internal node. Rather than traveling upwards from the leaf nodes, as in the paper,
/// each internal node checks its child nodes to get its AABB.
class b3GpuParallelLinearBvh
{
cl_command_queue m_queue;
cl_program m_parallelLinearBvhProgram;
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
cl_kernel m_constructBinaryTreeKernel;
cl_kernel m_determineInternalNodeAabbsKernel;
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
b3FillCL m_fill;
b3RadixSort32CL m_radixSorter;
//1 element per level in the tree
b3AlignedObjectArray<int> m_numNodesPerLevelCpu; //Level 0(m_numNodesPerLevelCpu[0]) is the root, last level contains the leaf nodes
b3AlignedObjectArray<int> m_firstIndexOffsetPerLevelCpu; //Contains the index/offset of the first node in that level
b3OpenCLArray<int> m_numNodesPerLevelGpu;
b3OpenCLArray<int> m_firstIndexOffsetPerLevelGpu;
//1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child
b3OpenCLArray<int> m_internalNodeParentNodes;
//1 element per leaf node
b3OpenCLArray<int> m_leafNodeParentNodes;
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key = morton code, m_value == aabb index
public:
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
m_queue(queue),
m_fill(context, device, queue),
m_radixSorter(context, device, queue),
m_numNodesPerLevelGpu(context, queue),
m_firstIndexOffsetPerLevelGpu(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue)
{
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_constructBinaryTreeKernel);
m_determineInternalNodeAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "determineInternalNodeAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_determineInternalNodeAabbsKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
}
virtual ~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_constructBinaryTreeKernel);
clReleaseKernel(m_determineInternalNodeAabbsKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
// fix: need to handle/test case with 2 nodes
///@param cellsize A virtual grid of size 2^10^3 is used in the process of creating the BVH
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, b3Scalar cellSize)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLeaves = worldSpaceAabbs.size(); //Number of leaves in the BVH == Number of rigid body AABBs
int numInternalNodes = numLeaves - 1;
if(numLeaves < 2) return;
//
{
m_internalNodeAabbs.resize(numInternalNodes);
m_internalNodeChildNodes.resize(numInternalNodes);
m_internalNodeParentNodes.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
}
//Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) )
//The number of levels is equivalent to the number of bits needed to uniquely identify each node(including both internal and leaf nodes)
int numLevels = 0;
{
//Find the most significant bit(msb)
int mostSignificantBit = 0;
{
int temp = numLeaves;
while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
}
numLevels = mostSignificantBit + 1;
//If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level
if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++;
if(1) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit);
}
//Determine number of nodes per level, use prefix sum to get offsets of each level, and send to GPU
{
B3_PROFILE("Determine number of nodes per level");
m_numNodesPerLevelCpu.resize(numLevels);
//The last level contains the leaf nodes; number of leaves is already known
if(numLevels - 1 >= 0) m_numNodesPerLevelCpu[numLevels - 1] = numLeaves;
//Calculate number of nodes in each level;
//start from the second to last level(level right next to leaf nodes) and move towards the root(level 0)
int hasRemainder = 0;
for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex)
{
int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves
bool allNodesAllocated = ( (numNodesPreviousLevel + hasRemainder) % 2 == 0 );
int numNodesCurrentLevel = (allNodesAllocated) ? (numNodesPreviousLevel + hasRemainder) / 2 : numNodesPreviousLevel / 2;
m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel;
hasRemainder = static_cast<int>(!allNodesAllocated);
}
//Prefix sum to calculate the first index offset of each level
{
m_firstIndexOffsetPerLevelCpu = m_numNodesPerLevelCpu;
//Perform inclusive scan
for(int i = 1; i < m_firstIndexOffsetPerLevelCpu.size(); ++i)
m_firstIndexOffsetPerLevelCpu[i] += m_firstIndexOffsetPerLevelCpu[i - 1];
//Convert inclusive scan to exclusive scan to get the offsets
//This is equivalent to shifting each element in m_firstIndexOffsetPerLevelCpu[] by 1 to the right,
//and setting the first element to 0
for(int i = 0; i < m_firstIndexOffsetPerLevelCpu.size(); ++i)
m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i];
}
if(1)
{
int numInternalNodes = 0;
for(int i = 0; i < numLevels; ++i)
if(i < numLevels - 1) numInternalNodes += m_numNodesPerLevelCpu[i];
printf("numInternalNodes: %d\n", numInternalNodes);
for(int i = 0; i < numLevels; ++i)
printf("numNodes, offset[%d]: %d, %d \n", i, m_numNodesPerLevelCpu[i], m_firstIndexOffsetPerLevelCpu[i]);
printf("\n");
}
//Copy to GPU
m_numNodesPerLevelGpu.copyFromHost(m_numNodesPerLevelCpu, false);
m_firstIndexOffsetPerLevelGpu.copyFromHost(m_firstIndexOffsetPerLevelCpu, false);
clFinish(m_queue);
}
//Find the AABB of all input AABBs; this is used to define the size of
//each cell in the virtual grid(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
/*b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( m_allNodesMergedAabb.getBufferCL() ),
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabb, "m_findAllNodesMergedAabb");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);*/
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
// m_key == morton code (value to sort by)
// m_value = AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(cellSize);
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//Optional; only element at m_internalNodeParentNodes[0], the root node, needs to be set here
//as the parent indices of other nodes are overwritten during m_constructBinaryTreeKernel
{
B3_PROFILE("Reset parent node indices");
m_fill.execute( m_internalNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_internalNodeParentNodes.size() );
m_fill.execute( m_leafNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_leafNodeParentNodes.size() );
clFinish(m_queue);
}
//Construct binary tree; find the children of each internal node, and assign parent nodes
{
B3_PROFILE("Construct binary tree");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_constructBinaryTreeKernel, "m_constructBinaryTreeKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLevels);
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
if(1)
{
static b3AlignedObjectArray<b3Int2> internalNodeChildNodes;
m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false);
clFinish(m_queue);
for(int i = 0; i < 256; ++i) printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y);
printf("\n");
}
}
//For each internal node, check children to get its AABB; start from the
//last level and move towards the root
{
B3_PROFILE("Set AABBs");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLevels);
launcher.setConst(numInternalNodes);
launcher.launch1D(numLeaves);
clFinish(m_queue);
if(0)
{
static b3AlignedObjectArray<b3SapAabb> rigidAabbs;
worldSpaceAabbs.copyToHost(rigidAabbs, false);
clFinish(m_queue);
b3SapAabb actualRootAabb;
actualRootAabb.m_minVec = b3MakeVector3(B3_LARGE_FLOAT, B3_LARGE_FLOAT, B3_LARGE_FLOAT);
actualRootAabb.m_maxVec = b3MakeVector3(-B3_LARGE_FLOAT, -B3_LARGE_FLOAT, -B3_LARGE_FLOAT);
for(int i = 0; i < rigidAabbs.size(); ++i)
{
actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec);
actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec);
}
printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z);
printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z);
b3SapAabb rootAabb = m_internalNodeAabbs.at(0);
printf("rootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z);
printf("rootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z);
printf("\n");
}
}
}
//Max number of pairs is out_overlappingPairs.size()
//If the number of overlapping pairs is < out_overlappingPairs.size(), the array is resized
void calculateOverlappingPairs(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs,
b3OpenCLArray<int>& out_numPairs, b3OpenCLArray<b3Int4>& out_overlappingPairs)
{
b3Assert( out_numPairs.size() == 1 );
int maxPairs = out_overlappingPairs.size();
int reset = 0;
out_numPairs.copyFromHostPointer(&reset, 1);
{
B3_PROFILE("PLBVH calculateOverlappingPairs");
int numQueryAabbs = worldSpaceAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( out_numPairs.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
int numPairs = -1;
out_numPairs.copyToHostPointer(&numPairs, 1);
if(numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
out_numPairs.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
};
#endif

View File

@ -0,0 +1,98 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "b3GpuParallelLinearBvh.h"
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
b3GpuParallelLinearBvh m_plbvh;
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
b3OpenCLArray<int> m_tempNumPairs;
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
public:
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_tempNumPairs(context, queue)
{
m_tempNumPairs.resize(1);
}
virtual ~b3GpuParallelLinearBvhBroadphase() {}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
m_aabbsCpu.push_back(aabb);
}
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
{
b3Assert(0); //Not implemented
}
virtual void calculateOverlappingPairs(int maxPairs)
{
//Detect overall min/max
{
//Not implemented
}
//Reconstruct BVH
const b3Scalar CELL_SIZE(0.1);
m_plbvh.build(m_aabbsGpu, CELL_SIZE);
//
m_overlappingPairsGpu.resize(maxPairs);
m_plbvh.calculateOverlappingPairs(m_aabbsGpu, m_tempNumPairs, m_overlappingPairsGpu);
}
virtual void calculateOverlappingPairsHost(int maxPairs)
{
b3Assert(0); //CPU version not implemented
}
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu() { m_aabbsGpu.copyFromHost(m_aabbsCpu); }
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
{
b3Assert(0); //CPU version not implemented
return m_aabbsCpu;
}
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
}
};
#endif

View File

@ -0,0 +1,399 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
typedef float b3Scalar;
typedef float4 b3Vector3;
#define b3Max max
#define b3Min min
typedef struct
{
unsigned int m_key;
unsigned int m_value;
} SortDataCL;
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} b3AabbCL;
unsigned int interleaveBits(unsigned int x)
{
//........ ........ ......12 3456789A //x
//....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits
//........ ....1234 56789A12 3456789A //x |= (x << 10)
//........ ....1111 1....... ...11111 //0x 00 0F 80 1F
//........ ....1234 5....... ...6789A //x = ( x | (x << 10) ) & 0x000F801F;
//.......1 23451234 5.....67 89A6789A //x |= (x << 5)
//.......1 1.....11 1.....11 .....111 //0x 01 83 83 07
//.......1 2.....34 5.....67 .....89A //x = ( x | (x << 5) ) & 0x01838307;
//....12.1 2..34534 5..67.67 ..89A89A //x |= (x << 3)
//....1... 1..1...1 1..1...1 ..1...11 //0x 08 91 91 23
//....1... 2..3...4 5..6...7 ..8...9A //x = ( x | (x << 3) ) & 0x08919123;
//...11..2 2.33..4N 5.66..77 .88..9NA //x |= (x << 1) ( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} )
//....1..1 ..1...1. 1..1..1. .1...1.1 //0x 09 22 92 45
//....1..2 ..3...4. 5..6..7. .8...9.A //x = ( x | (x << 1) ) & 0x09229245;
//...11.22 .33..445 5.66.77. 88..99AA //x |= (x << 1)
//....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 34 92 29
//....1..2 ..3..4.. 5..6..7. .8..9..A //x = ( x | (x << 1) ) & 0x09349229;
//........ ........ ......11 11111111 //0x000003FF
x &= 0x000003FF; //Clear all bits above bit 10
x = ( x | (x << 10) ) & 0x000F801F;
x = ( x | (x << 5) ) & 0x01838307;
x = ( x | (x << 3) ) & 0x08919123;
x = ( x | (x << 1) ) & 0x09229245;
x = ( x | (x << 1) ) & 0x09349229;
return x;
}
unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
{
return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
}
/*
__kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* out_mergedAabb, int numAabbs)
{
int aabbIndex = get_global_id(0);
if(aabbIndex >= numAabbs) return;
//Find the most significant bit(msb)
int mostSignificantBit = 0;
{
int temp = numLeaves;
while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
}
int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit ); // verify
int numRemainingAabbs = (1 << mostSignificantBit);
//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2
//For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]
if(aabbIndex < numberOfAabbsAboveMsbSplit)
{
int otherAabbIndex = numRemainingAabbs + aabbIndex;
b3AabbCL aabb = worldSpaceAabbs[aabbIndex];
b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
out_mergedAabb[aabbIndex] = mergedAabb;
}
barrier(CLK_GLOBAL_MEM_FENCE);
//
int offset = numRemainingAabbs / 2;
while(offset >= 1)
{
if(aabbIndex < offset)
{
int otherAabbIndex = aabbIndex + offset;
b3AabbCL aabb = worldSpaceAabbs[aabbIndex];
b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
out_mergedAabb[aabbIndex] = mergedAabb;
}
offset = offset / 2;
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
*/
__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs,
__global SortDataCL* out_mortonCodesAndAabbIndices,
b3Scalar cellSize, int numAabbs)
{
int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index
if(leafNodeIndex >= numAabbs) return;
b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;
//Quantize into integer coordinates
//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
b3Vector3 gridPosition = center / cellSize;
int4 discretePosition;
discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
discretePosition += 512;
//Interleave bits(assign a morton code, also known as a z-curve)
unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
//
SortDataCL mortonCodeIndexPair;
mortonCodeIndexPair.m_key = mortonCode;
mortonCodeIndexPair.m_value = leafNodeIndex;
out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
}
#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128
#define B3_PLBVH_ROOT_NODE_MARKER -1 //Used to indicate that the (root) node has no parent
#define B3_PLBVH_ROOT_NODE_INDEX 0
//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.
//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).
//
//Since the root internal node is at index 0, no internal nodes should reference it as a child,
//and so index 0 is always used to indicate a leaf node.
int isLeafNode(int index) { return (index >= 0); }
int getIndexWithInternalNodeMarkerRemoved(int index) { return (index >= 0) ? index : -index; }
int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : -index; }
__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,
__global int* numNodesPerLevel,
__global int2* out_internalNodeChildIndices,
__global int* out_internalNodeParentNodes,
__global int* out_leafNodeParentNodes,
int numLevels, int numInternalNodes)
{
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
//Find the level that this node is in, using linear search(could replace with binary search)
int level = 0;
int numInternalLevels = numLevels - 1; //All levels except the last are internal nodes
for(; level < numInternalLevels; ++level)
{
if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break;
}
//Check lower levels to find child nodes
//Left child is always in the next level, but the same does not apply to the right child
int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level];
int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1]; //Should never be out of bounds(see for loop above)
int leftChildLevel = level + 1;
int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2;
int rightChildLevel = level + 1;
int rightChildIndex = leftChildIndex + 1;
//Under certain conditions, the right child index as calculated above is invalid; need to find the correct index
//
//First condition: must be at least 2 levels apart from the leaf node level;
//if the current level is right next to the leaf node level, then the right child
//will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access)
//
//Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid
//
//Third condition: must be the last node in its level
if( level < numLevels - 2
&& numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1]
&& indexInLevel == numNodesPerLevel[level] - 1 )
{
//Check lower levels until we find a node without a parent
for(; rightChildLevel < numLevels - 1; ++rightChildLevel)
{
int rightChildNextLevel = rightChildLevel + 1;
//If this branch is taken, it means that the last node in rightChildNextLevel has no parent
if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] )
{
//Set the node to the last node in rightChildNextLevel
rightChildLevel = rightChildNextLevel;
rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1;
break;
}
}
}
int isLeftChildLeaf = (leftChildLevel >= numLevels - 1);
int isRightChildLeaf = (rightChildLevel >= numLevels - 1);
//If left/right child is a leaf node, the index needs to be corrected
//the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array,
//with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays
{
int leafNodeLevel = numLevels - 1;
leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;
rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;
}
//Set the negative sign bit if the node is internal
int2 childIndices;
childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex);
childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex);
out_internalNodeChildIndices[internalNodeIndex] = childIndices;
//Assign parent node index to children
__global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;
out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex;
__global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;
out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex;
}
__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,
__global int* numNodesPerLevel,
__global int2* internalNodeChildIndices,
__global SortDataCL* mortonCodesAndAabbIndices,
__global b3AabbCL* leafNodeAabbs,
__global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)
{
int i = get_global_id(0);
if(i >= numInternalNodes) return;
int numInternalLevels = numLevels - 1;
//Starting from the level next to the leaf nodes, move towards the root(level 0)
for(int level = numInternalLevels - 1; level >= 0; --level)
{
int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]
int numNodesInLevel = numNodesPerLevel[level];
if(i < numNodesInLevel)
{
int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];
int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];
int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x);
int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y);
int isLeftChildLeaf = isLeafNode(childIndicies.x);
int isRightChildLeaf = isLeafNode(childIndicies.y);
int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];
b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];
b3AabbCL internalNodeAabb;
internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
//From sap.cl
#define NEW_PAIR_MARKER -1
bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
//From sap.cl
__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
__global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,
__global SortDataCL* mortonCodesAndAabbIndices,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numQueryAabbs)
{
#define USE_SPATIALLY_COHERENT_INDICIES //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve
#ifdef USE_SPATIALLY_COHERENT_INDICIES
int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(queryRigidIndex >= numQueryAabbs) return;
queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;
#else
int queryRigidIndex = get_global_id(0);
if(queryRigidIndex >= numQueryAabbs) return;
#endif
b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
//Starting by placing only the root node index, 0, in the stack causes it to be detected as a leaf node(see isLeafNode() in loop)
int stackSize = 2;
stack[0] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].x;
stack[1] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].y;
while(stackSize)
{
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
{
if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])
{
int4 pair;
pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
pair.z = NEW_PAIR_MARKER;
pair.w = NEW_PAIR_MARKER;
int pairIndex = atomic_inc(out_numPairs);
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
}
if(!isLeaf) //Internal node
{
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}

View File

@ -0,0 +1,325 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* parallelLinearBvhCL= \
"/*\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose,\n"
"including commercial applications, and to alter it and redistribute it freely,\n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Initial Author Jackson Lee, 2014\n"
"typedef float b3Scalar;\n"
"typedef float4 b3Vector3;\n"
"#define b3Max max\n"
"#define b3Min min\n"
"typedef struct\n"
"{\n"
" unsigned int m_key;\n"
" unsigned int m_value;\n"
"} SortDataCL;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"unsigned int interleaveBits(unsigned int x)\n"
"{\n"
" //........ ........ ......12 3456789A //x\n"
" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n"
" \n"
" //........ ....1234 56789A12 3456789A //x |= (x << 10)\n"
" //........ ....1111 1....... ...11111 //0x 00 0F 80 1F\n"
" //........ ....1234 5....... ...6789A //x = ( x | (x << 10) ) & 0x000F801F; \n"
" \n"
" //.......1 23451234 5.....67 89A6789A //x |= (x << 5)\n"
" //.......1 1.....11 1.....11 .....111 //0x 01 83 83 07\n"
" //.......1 2.....34 5.....67 .....89A //x = ( x | (x << 5) ) & 0x01838307;\n"
" \n"
" //....12.1 2..34534 5..67.67 ..89A89A //x |= (x << 3)\n"
" //....1... 1..1...1 1..1...1 ..1...11 //0x 08 91 91 23\n"
" //....1... 2..3...4 5..6...7 ..8...9A //x = ( x | (x << 3) ) & 0x08919123;\n"
" \n"
" //...11..2 2.33..4N 5.66..77 .88..9NA //x |= (x << 1) ( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} )\n"
" //....1..1 ..1...1. 1..1..1. .1...1.1 //0x 09 22 92 45\n"
" //....1..2 ..3...4. 5..6..7. .8...9.A //x = ( x | (x << 1) ) & 0x09229245;\n"
" \n"
" //...11.22 .33..445 5.66.77. 88..99AA //x |= (x << 1)\n"
" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 34 92 29\n"
" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = ( x | (x << 1) ) & 0x09349229;\n"
" \n"
" //........ ........ ......11 11111111 //0x000003FF\n"
" x &= 0x000003FF; //Clear all bits above bit 10\n"
" \n"
" x = ( x | (x << 10) ) & 0x000F801F;\n"
" x = ( x | (x << 5) ) & 0x01838307;\n"
" x = ( x | (x << 3) ) & 0x08919123;\n"
" x = ( x | (x << 1) ) & 0x09229245;\n"
" x = ( x | (x << 1) ) & 0x09349229;\n"
" \n"
" return x;\n"
"}\n"
"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
"{\n"
" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
"}\n"
"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, \n"
" __global SortDataCL* out_mortonCodesAndAabbIndices, \n"
" b3Scalar cellSize, int numAabbs)\n"
"{\n"
" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n"
" if(leafNodeIndex >= numAabbs) return;\n"
" \n"
" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
" \n"
" b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;\n"
" \n"
" //Quantize into integer coordinates\n"
" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
" b3Vector3 gridPosition = center / cellSize;\n"
" \n"
" int4 discretePosition;\n"
" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
" \n"
" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
" discretePosition += 512;\n"
" \n"
" //Interleave bits(assign a morton code, also known as a z-curve)\n"
" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
" \n"
" //\n"
" SortDataCL mortonCodeIndexPair;\n"
" mortonCodeIndexPair.m_key = mortonCode;\n"
" mortonCodeIndexPair.m_value = leafNodeIndex;\n"
" \n"
" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
"}\n"
"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
"#define B3_PLBVH_ROOT_NODE_MARKER -1 //Used to indicate that the node has no parent \n"
"#define B3_PLBVH_ROOT_NODE_INDEX 0\n"
"//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.\n"
"//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).\n"
"//\n"
"//Since the root internal node is at index 0, no internal nodes should reference it as a child,\n"
"//and so index 0 is always used to indicate a leaf node.\n"
"int isLeafNode(int index) { return (index >= 0); }\n"
"int getIndexWithInternalNodeMarkerRemoved(int index) { return (index >= 0) ? index : -index; }\n"
"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : -index; }\n"
"__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,\n"
" __global int* numNodesPerLevel,\n"
" __global int2* out_internalNodeChildIndices, \n"
" __global int* out_internalNodeParentNodes, \n"
" __global int* out_leafNodeParentNodes, \n"
" int numLevels, int numInternalNodes)\n"
"{\n"
" int internalNodeIndex = get_global_id(0);\n"
" if(internalNodeIndex >= numInternalNodes) return;\n"
" \n"
" //Find the level that this node is in, using linear search(could replace with binary search)\n"
" int level = 0;\n"
" int numInternalLevels = numLevels - 1; //All levels except the last are internal nodes\n"
" for(; level < numInternalLevels; ++level)\n"
" {\n"
" if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break;\n"
" }\n"
" \n"
" //Check lower levels to find child nodes\n"
" //Left child is always in the next level, but the same does not apply to the right child\n"
" int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level];\n"
" int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1]; //Should never be out of bounds(see for loop above)\n"
" \n"
" int leftChildLevel = level + 1;\n"
" int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2;\n"
" \n"
" int rightChildLevel = level + 1;\n"
" int rightChildIndex = leftChildIndex + 1;\n"
" \n"
" //Under certain conditions, the right child index as calculated above is invalid; need to find the correct index\n"
" //\n"
" //First condition: must be at least 2 levels apart from the leaf node level;\n"
" //if the current level is right next to the leaf node level, then the right child\n"
" //will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access)\n"
" //\n"
" //Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid\n"
" //\n"
" //Third condition: must be the last node in its level\n"
" if( level < numLevels - 2 \n"
" && numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] \n"
" && indexInLevel == numNodesPerLevel[level] - 1 )\n"
" {\n"
" //Check lower levels until we find a node without a parent\n"
" for(; rightChildLevel < numLevels - 1; ++rightChildLevel)\n"
" {\n"
" int rightChildNextLevel = rightChildLevel + 1;\n"
" \n"
" //If this branch is taken, it means that the last node in rightChildNextLevel has no parent\n"
" if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] )\n"
" {\n"
" //Set the node to the last node in rightChildNextLevel\n"
" rightChildLevel = rightChildNextLevel;\n"
" rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1;\n"
" break;\n"
" }\n"
" }\n"
" }\n"
" \n"
" int isLeftChildLeaf = (leftChildLevel >= numLevels - 1);\n"
" int isRightChildLeaf = (rightChildLevel >= numLevels - 1);\n"
" \n"
" //If left/right child is a leaf node, the index needs to be corrected\n"
" //the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array,\n"
" //with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays\n"
" {\n"
" int leafNodeLevel = numLevels - 1;\n"
" leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n"
" rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n"
" }\n"
" \n"
" //Set the negative sign bit if the node is internal\n"
" int2 childIndices;\n"
" childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex);\n"
" childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex);\n"
" out_internalNodeChildIndices[internalNodeIndex] = childIndices;\n"
" \n"
" //Assign parent node index to children\n"
" __global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n"
" out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex;\n"
" \n"
" __global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n"
" out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex;\n"
"}\n"
"__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,\n"
" __global int* numNodesPerLevel, \n"
" __global int2* internalNodeChildIndices,\n"
" __global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global b3AabbCL* leafNodeAabbs, \n"
" __global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)\n"
"{\n"
" int i = get_global_id(0);\n"
" if(i >= numInternalNodes) return;\n"
" \n"
" int numInternalLevels = numLevels - 1;\n"
" \n"
" //Starting from the level next to the leaf nodes, move towards the root(level 0)\n"
" for(int level = numInternalLevels - 1; level >= 0; --level)\n"
" {\n"
" int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]\n"
" \n"
" int numNodesInLevel = numNodesPerLevel[level];\n"
" if(i < numNodesInLevel)\n"
" {\n"
" int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n"
" int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n"
" \n"
" int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x);\n"
" int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y);\n"
" \n"
" int isLeftChildLeaf = isLeafNode(childIndicies.x);\n"
" int isRightChildLeaf = isLeafNode(childIndicies.y);\n"
" \n"
" int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
" int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
" \n"
" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n"
" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n"
" \n"
" b3AabbCL internalNodeAabb;\n"
" internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
" internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
" out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n"
" }\n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
"}\n"
"//From sap.cl\n"
"#define NEW_PAIR_MARKER -1\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"//From sap.cl\n"
"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
" __global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,\n"
" __global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
" int maxPairs, int numQueryAabbs)\n"
"{\n"
" int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
" if(queryRigidIndex >= numQueryAabbs) return;\n"
" \n"
" queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n"
" //int queryRigidIndex = get_global_id(0);\n"
" //if(queryRigidIndex >= numQueryAabbs) return;\n"
" \n"
" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
" \n"
" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
" \n"
" //Starting by placing only the root node index, 0, in the stack causes it to be detected as a leaf node(see isLeafNode() in loop)\n"
" int stackSize = 2;\n"
" stack[0] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].x;\n"
" stack[1] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].y;\n"
" \n"
" while(stackSize)\n"
" {\n"
" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
" --stackSize;\n"
" \n"
" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
" \n"
" //bvhRigidIndex is not used if internal node\n"
" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
" \n"
" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
" if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
" {\n"
" if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])\n"
" {\n"
" int4 pair;\n"
" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
" pair.z = NEW_PAIR_MARKER;\n"
" pair.w = NEW_PAIR_MARKER;\n"
" \n"
" int pairIndex = atomic_inc(out_numPairs);\n"
" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
" }\n"
" \n"
" if(!isLeaf) //Internal node\n"
" {\n"
" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
" {\n"
" //Error\n"
" }\n"
" else\n"
" {\n"
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
" }\n"
" }\n"
" }\n"
" \n"
" }\n"
"}\n"
;