Draft Parallel Linear BVH Broadphase.

2024-12-13 21:30:09 +00:00 · 2014-02-18 19:23:25 -08:00 · 2014-02-18 19:23:25 -08:00 · b7b7356af8
commit b7b7356af8
parent fabdf8b4a9
7 changed files with 1242 additions and 0 deletions
--- a/Demos3/GpuDemos/broadphase/PairBench.cpp
+++ b/Demos3/GpuDemos/broadphase/PairBench.cpp
@ -9,6 +9,7 @@
 #include "OpenGLWindow/b3gWindowInterface.h"
 #include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h"

 #include "../GpuDemoInternalData.h"
 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
@ -108,6 +109,7 @@ static int curSelectedBroadphase = 0;
 static BroadphaseEntry allBroadphases[]=
 {
 	{"Gpu Grid",b3GpuGridBroadphase::CreateFunc},
+	{"Parallel Linear BVH",b3GpuParallelLinearBvhBroadphase::CreateFunc},
 	{"CPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceCpu},
 	{"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu},
 	{"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal},
--- a/build3/stringify.bat
+++ b/build3/stringify.bat
@ -13,6 +13,7 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/Broadphas
 premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify

 premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify



--- a/build3/stringify_linux.sh
+++ b/build3/stringify_linux.sh
@ -14,6 +14,7 @@ rem @echo off
 ./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify

 ./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
+./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify



--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
@ -0,0 +1,416 @@
+/*
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Initial Author Jackson Lee, 2014
+
+#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H
+#define B3_GPU_PARALLEL_LINEAR_BVH_H
+
+//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+
+#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
+
+#define B3_PLBVH_ROOT_NODE_MARKER -1	//Syncronize with parallelLinearBvh.cl
+
+///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame
+///@remarks
+///Main references: \n
+///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
+///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
+///@par
+///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
+/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) 
+/// - [fully parallel] Sort morton codes
+/// - [somewhat parallel] Build radix binary tree (assign parent/child pointers for internal nodes of the BVH) 
+/// - [somewhat parallel] Set internal node AABBs 
+///@par
+///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
+///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree.
+/// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling.
+/// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the
+/// Z-curve anyways, and this method should be simpler and faster during construction. Additionally, it is still possible
+/// to improve the quality of the BVH by rearranging the connections between nodes.
+/// - Due to the way the tree is constructed, it becomes unnecessary to use atomic_inc to get
+/// the AABB for each internal node. Rather than traveling upwards from the leaf nodes, as in the paper,
+/// each internal node checks its child nodes to get its AABB.
+class b3GpuParallelLinearBvh
+{
+	cl_command_queue m_queue;
+	
+	cl_program m_parallelLinearBvhProgram;
+	
+	cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
+	cl_kernel m_constructBinaryTreeKernel;
+	cl_kernel m_determineInternalNodeAabbsKernel;
+	
+	cl_kernel m_plbvhCalculateOverlappingPairsKernel;
+
+	b3FillCL m_fill;
+	b3RadixSort32CL m_radixSorter;
+	
+	//1 element per level in the tree
+	b3AlignedObjectArray<int> m_numNodesPerLevelCpu;			//Level 0(m_numNodesPerLevelCpu[0]) is the root, last level contains the leaf nodes
+	b3AlignedObjectArray<int> m_firstIndexOffsetPerLevelCpu;	//Contains the index/offset of the first node in that level
+	b3OpenCLArray<int> m_numNodesPerLevelGpu;
+	b3OpenCLArray<int> m_firstIndexOffsetPerLevelGpu;
+	
+	//1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node
+	b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
+	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;				//x == left child, y == right child
+	b3OpenCLArray<int> m_internalNodeParentNodes;
+	
+	//1 element per leaf node
+	b3OpenCLArray<int> m_leafNodeParentNodes;
+	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;		//m_key = morton code, m_value == aabb index
+	
+public:
+	b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
+		m_queue(queue),
+		m_fill(context, device, queue),
+		m_radixSorter(context, device, queue),
+		
+		m_numNodesPerLevelGpu(context, queue),
+		m_firstIndexOffsetPerLevelGpu(context, queue),
+		m_internalNodeAabbs(context, queue),
+		m_internalNodeChildNodes(context, queue),
+		m_internalNodeParentNodes(context, queue),
+		m_leafNodeParentNodes(context, queue),
+		m_mortonCodesAndAabbIndicies(context, queue)
+	{
+		const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
+		
+		const char* kernelSource = parallelLinearBvhCL;	//parallelLinearBvhCL.h
+		cl_int error;
+		char* additionalMacros = 0;
+		m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
+		b3Assert(m_parallelLinearBvhProgram);
+		
+		m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
+		b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
+		m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros );
+		b3Assert(m_constructBinaryTreeKernel);
+		m_determineInternalNodeAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "determineInternalNodeAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
+		b3Assert(m_determineInternalNodeAabbsKernel);
+		
+		m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
+		b3Assert(m_plbvhCalculateOverlappingPairsKernel);
+	}
+	
+	virtual ~b3GpuParallelLinearBvh() 
+	{
+		clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
+		clReleaseKernel(m_constructBinaryTreeKernel);
+		clReleaseKernel(m_determineInternalNodeAabbsKernel);
+		
+		clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
+		
+		clReleaseProgram(m_parallelLinearBvhProgram);
+	}
+
+	//	fix: need to handle/test case with 2 nodes
+	
+	///@param cellsize A virtual grid of size 2^10^3 is used in the process of creating the BVH
+	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, b3Scalar cellSize)
+	{
+		B3_PROFILE("b3ParallelLinearBvh::build()");
+	
+		int numLeaves = worldSpaceAabbs.size();	//Number of leaves in the BVH == Number of rigid body AABBs
+		int numInternalNodes = numLeaves - 1;
+	
+		if(numLeaves < 2) return;
+	
+		//
+		{
+			m_internalNodeAabbs.resize(numInternalNodes);
+			m_internalNodeChildNodes.resize(numInternalNodes);
+			m_internalNodeParentNodes.resize(numInternalNodes);
+	
+			m_leafNodeParentNodes.resize(numLeaves);
+			m_mortonCodesAndAabbIndicies.resize(numLeaves);
+		}
+		
+		//Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) )
+		//The number of levels is equivalent to the number of bits needed to uniquely identify each node(including both internal and leaf nodes)
+		int numLevels = 0;
+		{
+			//Find the most significant bit(msb)
+			int mostSignificantBit = 0;
+			{
+				int temp = numLeaves;
+				while(temp >>= 1) mostSignificantBit++;		//Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
+			}
+			numLevels = mostSignificantBit + 1;
+			
+			//If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level
+			if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++;
+			
+			if(1) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit);
+		}
+		
+		//Determine number of nodes per level, use prefix sum to get offsets of each level, and send to GPU
+		{
+			B3_PROFILE("Determine number of nodes per level");
+			
+			m_numNodesPerLevelCpu.resize(numLevels);
+			
+			//The last level contains the leaf nodes; number of leaves is already known
+			if(numLevels - 1 >= 0) m_numNodesPerLevelCpu[numLevels - 1] = numLeaves;
+			
+			//Calculate number of nodes in each level; 
+			//start from the second to last level(level right next to leaf nodes) and move towards the root(level 0)
+			int hasRemainder = 0;
+			for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex)
+			{
+				int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1];		//For first iteration this == numLeaves
+			
+				bool allNodesAllocated = ( (numNodesPreviousLevel + hasRemainder) % 2 == 0 );
+			
+				int numNodesCurrentLevel = (allNodesAllocated) ? (numNodesPreviousLevel + hasRemainder) / 2 : numNodesPreviousLevel / 2;
+				m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel;
+				
+				hasRemainder = static_cast<int>(!allNodesAllocated);
+			}
+			
+			//Prefix sum to calculate the first index offset of each level
+			{
+				m_firstIndexOffsetPerLevelCpu = m_numNodesPerLevelCpu;
+				
+				//Perform inclusive scan
+				for(int i = 1; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) 
+					m_firstIndexOffsetPerLevelCpu[i] += m_firstIndexOffsetPerLevelCpu[i - 1];
+				
+				//Convert inclusive scan to exclusive scan to get the offsets
+				//This is equivalent to shifting each element in m_firstIndexOffsetPerLevelCpu[] by 1 to the right, 
+				//and setting the first element to 0
+				for(int i = 0; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) 
+					m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i];
+			}
+			
+			if(1)
+			{
+				int numInternalNodes = 0;
+				for(int i = 0; i < numLevels; ++i) 
+					if(i < numLevels - 1) numInternalNodes += m_numNodesPerLevelCpu[i];
+				printf("numInternalNodes: %d\n", numInternalNodes);
+				
+				for(int i = 0; i < numLevels; ++i) 
+					printf("numNodes, offset[%d]: %d, %d \n", i, m_numNodesPerLevelCpu[i], m_firstIndexOffsetPerLevelCpu[i]);
+				printf("\n");
+			}
+			
+			//Copy to GPU
+			m_numNodesPerLevelGpu.copyFromHost(m_numNodesPerLevelCpu, false);
+			m_firstIndexOffsetPerLevelGpu.copyFromHost(m_firstIndexOffsetPerLevelCpu, false);
+			clFinish(m_queue);
+		}
+		
+		//Find the AABB of all input AABBs; this is used to define the size of 
+		//each cell in the virtual grid(2^10 cells in each dimension).
+		{
+			B3_PROFILE("Find AABB of merged nodes");
+		
+			/*b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_allNodesMergedAabb.getBufferCL() ),
+			};
+			
+			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabb, "m_findAllNodesMergedAabb");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numLeaves);
+			
+			launcher.launch1D(numLeaves);
+			clFinish(m_queue);*/
+		}
+		
+		//Insert the center of the AABBs into a virtual grid,
+		//then convert the discrete grid coordinates into a morton code
+		//For each element in m_mortonCodesAndAabbIndicies, set
+		//	m_key == morton code (value to sort by)
+		//	m_value = AABB index
+		{
+			B3_PROFILE("Assign morton codes");
+		
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
+			};
+			
+			b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(cellSize);
+			launcher.setConst(numLeaves);
+			
+			launcher.launch1D(numLeaves);
+			clFinish(m_queue);
+		}
+		
+		//
+		{
+			B3_PROFILE("Sort leaves by morton codes");
+		
+			m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
+			clFinish(m_queue);
+		}
+		
+		//Optional; only element at m_internalNodeParentNodes[0], the root node, needs to be set here
+		//as the parent indices of other nodes are overwritten during m_constructBinaryTreeKernel
+		{
+			B3_PROFILE("Reset parent node indices");
+			
+			m_fill.execute( m_internalNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_internalNodeParentNodes.size() );
+			m_fill.execute( m_leafNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_leafNodeParentNodes.size() );
+			clFinish(m_queue);
+		}
+		
+		//Construct binary tree; find the children of each internal node, and assign parent nodes
+		{
+			B3_PROFILE("Construct binary tree");
+		
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
+				b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
+				b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() )
+			};
+			
+			b3LauncherCL launcher(m_queue, m_constructBinaryTreeKernel, "m_constructBinaryTreeKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numLevels);
+			launcher.setConst(numInternalNodes);
+			
+			launcher.launch1D(numInternalNodes);
+			clFinish(m_queue);
+			
+			if(1)
+			{
+				static b3AlignedObjectArray<b3Int2> internalNodeChildNodes;	
+				m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false);
+				clFinish(m_queue);
+				
+				for(int i = 0; i < 256; ++i) printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y);
+				printf("\n");
+			}
+		}
+		
+		//For each internal node, check children to get its AABB; start from the 
+		//last level and move towards the root
+		{
+			B3_PROFILE("Set AABBs");
+		
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
+				b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
+			};
+			
+			b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numLevels);
+			launcher.setConst(numInternalNodes);
+			
+			launcher.launch1D(numLeaves);
+			clFinish(m_queue);
+			
+			if(0)
+			{
+				static b3AlignedObjectArray<b3SapAabb> rigidAabbs;
+				worldSpaceAabbs.copyToHost(rigidAabbs, false);
+				clFinish(m_queue);
+			
+				b3SapAabb actualRootAabb;
+				actualRootAabb.m_minVec = b3MakeVector3(B3_LARGE_FLOAT, B3_LARGE_FLOAT, B3_LARGE_FLOAT);
+				actualRootAabb.m_maxVec = b3MakeVector3(-B3_LARGE_FLOAT, -B3_LARGE_FLOAT, -B3_LARGE_FLOAT);
+				for(int i = 0; i < rigidAabbs.size(); ++i)
+				{
+					actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec);
+					actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec);
+				}
+				printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z);
+				printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z);
+			
+				b3SapAabb rootAabb = m_internalNodeAabbs.at(0);
+				printf("rootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z);
+				printf("rootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z);
+				printf("\n");
+			}
+		}
+	}
+	
+	//Max number of pairs is out_overlappingPairs.size()
+	//If the number of overlapping pairs is < out_overlappingPairs.size(), the array is resized
+	void calculateOverlappingPairs(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, 
+									b3OpenCLArray<int>& out_numPairs, b3OpenCLArray<b3Int4>& out_overlappingPairs)
+	{
+		b3Assert( out_numPairs.size() == 1 );
+		
+		int maxPairs = out_overlappingPairs.size();
+			
+		int reset = 0;
+		out_numPairs.copyFromHostPointer(&reset, 1);
+		
+		{
+			B3_PROFILE("PLBVH calculateOverlappingPairs");
+		
+			int numQueryAabbs = worldSpaceAabbs.size();
+			
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				
+				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+				
+				b3BufferInfoCL( out_numPairs.getBufferCL() ),
+				b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
+			};
+			
+			b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(maxPairs);
+			launcher.setConst(numQueryAabbs);
+			
+			launcher.launch1D(numQueryAabbs);
+			clFinish(m_queue);
+		}
+		
+		
+		//
+		int numPairs = -1;
+		out_numPairs.copyToHostPointer(&numPairs, 1);
+		if(numPairs > maxPairs)
+		{
+			b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+			numPairs = maxPairs;
+			out_numPairs.copyFromHostPointer(&maxPairs, 1);
+		}
+		
+		out_overlappingPairs.resize(numPairs);
+	}
+};
+
+#endif
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
@ -0,0 +1,98 @@
+/*
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Initial Author Jackson Lee, 2014
+
+#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
+#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
+
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
+
+#include "b3GpuParallelLinearBvh.h"
+
+class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
+{
+	b3GpuParallelLinearBvh m_plbvh;
+	
+	b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
+	b3OpenCLArray<b3SapAabb> m_aabbsGpu;
+	b3OpenCLArray<int> m_tempNumPairs;
+	
+	b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
+	
+public:
+	b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : 
+		m_plbvh(context, device, queue),
+		
+		m_overlappingPairsGpu(context, queue),
+		m_aabbsGpu(context, queue),
+		m_tempNumPairs(context, queue)
+	{
+		m_tempNumPairs.resize(1);
+	}
+	virtual ~b3GpuParallelLinearBvhBroadphase() {}
+
+	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
+	{
+		b3SapAabb aabb;
+		aabb.m_minVec = aabbMin;
+		aabb.m_maxVec = aabbMax;
+		aabb.m_minIndices[3] = userPtr;
+		
+		m_aabbsCpu.push_back(aabb);
+	}
+	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
+	{
+		b3Assert(0);	//Not implemented
+	}
+	
+	virtual void calculateOverlappingPairs(int maxPairs)
+	{
+		//Detect overall min/max
+		{
+			//Not implemented
+		}
+		
+		//Reconstruct BVH
+		const b3Scalar CELL_SIZE(0.1); 
+		m_plbvh.build(m_aabbsGpu, CELL_SIZE);
+		
+		//
+		m_overlappingPairsGpu.resize(maxPairs);
+		m_plbvh.calculateOverlappingPairs(m_aabbsGpu, m_tempNumPairs, m_overlappingPairsGpu);
+	}
+	virtual void calculateOverlappingPairsHost(int maxPairs)
+	{
+		b3Assert(0);	//CPU version not implemented
+	}
+
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	virtual void writeAabbsToGpu() { m_aabbsGpu.copyFromHost(m_aabbsCpu); }
+	
+	virtual int	getNumOverlap() { return m_overlappingPairsGpu.size(); }
+	virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
+
+	virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
+	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
+	
+	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
+	{
+		b3Assert(0);	//CPU version not implemented
+		return m_aabbsCpu;
+	}
+	
+	static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
+	{
+		return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
+	}
+};
+
+#endif
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
@ -0,0 +1,399 @@
+/*
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Initial Author Jackson Lee, 2014
+
+typedef float b3Scalar;
+typedef float4 b3Vector3;
+#define b3Max max
+#define b3Min min
+
+typedef struct
+{
+	unsigned int m_key;
+	unsigned int m_value;
+} SortDataCL;
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} b3AabbCL;
+
+
+unsigned int interleaveBits(unsigned int x)
+{
+	//........ ........ ......12 3456789A	//x
+	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits
+	
+	//........ ....1234 56789A12 3456789A	//x |= (x << 10)
+	//........ ....1111 1....... ...11111	//0x 00 0F 80 1F
+	//........ ....1234 5....... ...6789A	//x = ( x | (x << 10) ) & 0x000F801F; 
+	
+	//.......1 23451234 5.....67 89A6789A	//x |= (x <<  5)
+	//.......1 1.....11 1.....11 .....111	//0x 01 83 83 07
+	//.......1 2.....34 5.....67 .....89A	//x = ( x | (x <<  5) ) & 0x01838307;
+	
+	//....12.1 2..34534 5..67.67 ..89A89A	//x |= (x <<  3)
+	//....1... 1..1...1 1..1...1 ..1...11	//0x 08 91 91 23
+	//....1... 2..3...4 5..6...7 ..8...9A	//x = ( x | (x <<  3) ) & 0x08919123;
+	
+	//...11..2 2.33..4N 5.66..77 .88..9NA	//x |= (x <<  1)	( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} )
+	//....1..1 ..1...1. 1..1..1. .1...1.1	//0x 09 22 92 45
+	//....1..2 ..3...4. 5..6..7. .8...9.A	//x = ( x | (x <<  1) ) & 0x09229245;
+	
+	//...11.22 .33..445 5.66.77. 88..99AA	//x |= (x <<  1)
+	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 34 92 29
+	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = ( x | (x <<  1) ) & 0x09349229;
+	
+	//........ ........ ......11 11111111	//0x000003FF
+	x &= 0x000003FF;		//Clear all bits above bit 10
+	
+	x = ( x | (x << 10) ) & 0x000F801F;
+	x = ( x | (x <<  5) ) & 0x01838307;
+	x = ( x | (x <<  3) ) & 0x08919123;
+	x = ( x | (x <<  1) ) & 0x09229245;
+	x = ( x | (x <<  1) ) & 0x09349229;
+	
+	return x;
+}
+unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
+{
+	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
+}
+
+/*
+__kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* out_mergedAabb, int numAabbs)
+{
+	int aabbIndex = get_global_id(0);
+	if(aabbIndex >= numAabbs) return;
+	
+	//Find the most significant bit(msb)
+	int mostSignificantBit = 0;
+	{
+		int temp = numLeaves;
+		while(temp >>= 1) mostSignificantBit++;		//Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
+	}
+	
+	int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );	//	verify
+	int numRemainingAabbs = (1 << mostSignificantBit);
+	
+	//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2
+	//For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]
+	if(aabbIndex < numberOfAabbsAboveMsbSplit)
+	{
+		int otherAabbIndex = numRemainingAabbs + aabbIndex;
+		
+		b3AabbCL aabb = worldSpaceAabbs[aabbIndex];
+		b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex];
+		
+		b3AabbCL mergedAabb;
+		mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
+		mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
+		out_mergedAabb[aabbIndex] = mergedAabb;
+	}
+	
+	barrier(CLK_GLOBAL_MEM_FENCE);
+	
+	//
+	int offset = numRemainingAabbs / 2;
+	while(offset >= 1)
+	{
+		if(aabbIndex < offset)
+		{
+			int otherAabbIndex = aabbIndex + offset;
+		
+			b3AabbCL aabb = worldSpaceAabbs[aabbIndex];
+			b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex];
+		
+			b3AabbCL mergedAabb;
+			mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
+			mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
+			out_mergedAabb[aabbIndex] = mergedAabb;
+		}
+		
+		offset = offset / 2;
+		
+		barrier(CLK_GLOBAL_MEM_FENCE);
+	}
+}
+*/
+
+__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, 
+												__global SortDataCL* out_mortonCodesAndAabbIndices, 
+												b3Scalar cellSize, int numAabbs)
+{
+	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index
+	if(leafNodeIndex >= numAabbs) return;
+	
+	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
+	
+	b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;
+	
+	//Quantize into integer coordinates
+	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
+	b3Vector3 gridPosition = center / cellSize;
+	
+	int4 discretePosition;
+	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
+	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
+	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
+	
+	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
+	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
+	discretePosition += 512;
+	
+	//Interleave bits(assign a morton code, also known as a z-curve)
+	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
+	
+	//
+	SortDataCL mortonCodeIndexPair;
+	mortonCodeIndexPair.m_key = mortonCode;
+	mortonCodeIndexPair.m_value = leafNodeIndex;
+	
+	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
+}
+
+#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128
+#define B3_PLBVH_ROOT_NODE_MARKER -1	//Used to indicate that the (root) node has no parent 
+#define B3_PLBVH_ROOT_NODE_INDEX 0
+
+//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.
+//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).
+//
+//Since the root internal node is at index 0, no internal nodes should reference it as a child,
+//and so index 0 is always used to indicate a leaf node.
+int isLeafNode(int index) { return (index >= 0); }
+int getIndexWithInternalNodeMarkerRemoved(int index) { return (index >= 0) ? index : -index; }
+int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : -index; }
+
+__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,
+									__global int* numNodesPerLevel,
+									__global int2* out_internalNodeChildIndices, 
+									__global int* out_internalNodeParentNodes, 
+									__global int* out_leafNodeParentNodes, 
+									int numLevels, int numInternalNodes)
+{
+	int internalNodeIndex = get_global_id(0);
+	if(internalNodeIndex >= numInternalNodes) return;
+	
+	//Find the level that this node is in, using linear search(could replace with binary search)
+	int level = 0;
+	int numInternalLevels = numLevels - 1;		//All levels except the last are internal nodes
+	for(; level < numInternalLevels; ++level)
+	{
+		if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break;
+	}
+	
+	//Check lower levels to find child nodes
+	//Left child is always in the next level, but the same does not apply to the right child
+	int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level];
+	int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1];	//Should never be out of bounds(see for loop above)
+	
+	int leftChildLevel = level + 1;
+	int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2;
+	
+	int rightChildLevel = level + 1;
+	int rightChildIndex = leftChildIndex + 1;
+	
+	//Under certain conditions, the right child index as calculated above is invalid; need to find the correct index
+	//
+	//First condition: must be at least 2 levels apart from the leaf node level;
+	//if the current level is right next to the leaf node level, then the right child
+	//will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access)
+	//
+	//Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid
+	//
+	//Third condition: must be the last node in its level
+	if( level < numLevels - 2 
+		&& numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] 
+		&& indexInLevel == numNodesPerLevel[level] - 1 )
+	{
+		//Check lower levels until we find a node without a parent
+		for(; rightChildLevel < numLevels - 1; ++rightChildLevel)
+		{
+			int rightChildNextLevel = rightChildLevel + 1;
+		
+			//If this branch is taken, it means that the last node in rightChildNextLevel has no parent
+			if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] )
+			{
+				//Set the node to the last node in rightChildNextLevel
+				rightChildLevel = rightChildNextLevel;
+				rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1;
+				break;
+			}
+		}
+	}
+	
+	int isLeftChildLeaf = (leftChildLevel >= numLevels - 1);
+	int isRightChildLeaf = (rightChildLevel >= numLevels - 1);
+	
+	//If left/right child is a leaf node, the index needs to be corrected
+	//the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array,
+	//with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays
+	{
+		int leafNodeLevel = numLevels - 1;
+		leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;
+		rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;
+	}
+	
+	//Set the negative sign bit if the node is internal
+	int2 childIndices;
+	childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex);
+	childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex);
+	out_internalNodeChildIndices[internalNodeIndex] = childIndices;
+	
+	//Assign parent node index to children
+	__global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;
+	out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex;
+	
+	__global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;
+	out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex;
+}
+
+__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,
+										__global int* numNodesPerLevel, 
+										__global int2* internalNodeChildIndices,
+										__global SortDataCL* mortonCodesAndAabbIndices,
+										__global b3AabbCL* leafNodeAabbs, 
+										__global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)
+{
+	int i = get_global_id(0);
+	if(i >= numInternalNodes) return;
+	
+	int numInternalLevels = numLevels - 1;
+	
+	//Starting from the level next to the leaf nodes, move towards the root(level 0)
+	for(int level = numInternalLevels - 1; level >= 0; --level)
+	{
+		int indexInLevel = i;	//Index relative to firstIndexOffsetPerLevel[level]
+		
+		int numNodesInLevel = numNodesPerLevel[level];
+		if(i < numNodesInLevel)
+		{
+			int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];
+			int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];
+			
+			int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x);
+			int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y);
+		
+			int isLeftChildLeaf = isLeafNode(childIndicies.x);
+			int isRightChildLeaf = isLeafNode(childIndicies.y);
+			
+			int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
+			int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
+			
+			b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];
+			b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];
+			
+			b3AabbCL internalNodeAabb;
+			internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
+			internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
+			out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;
+		}
+	
+		barrier(CLK_GLOBAL_MEM_FENCE);
+	}
+}
+
+
+//From sap.cl
+#define NEW_PAIR_MARKER -1
+
+bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+//From sap.cl
+
+__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, 
+											__global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,
+											__global SortDataCL* mortonCodesAndAabbIndices,
+											__global int* out_numPairs, __global int4* out_overlappingPairs, 
+											int maxPairs, int numQueryAabbs)
+{
+#define USE_SPATIALLY_COHERENT_INDICIES		//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve
+#ifdef USE_SPATIALLY_COHERENT_INDICIES
+	int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
+	if(queryRigidIndex >= numQueryAabbs) return;
+	
+	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;
+#else
+	int queryRigidIndex = get_global_id(0);
+	if(queryRigidIndex >= numQueryAabbs) return;
+#endif
+
+	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
+	
+	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
+	
+	//Starting by placing only the root node index, 0, in the stack causes it to be detected as a leaf node(see isLeafNode() in loop)
+	int stackSize = 2;
+	stack[0] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].x;
+	stack[1] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].y;
+	
+	while(stackSize)
+	{
+		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
+		--stackSize;
+		
+		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false
+		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
+	
+		//bvhRigidIndex is not used if internal node
+		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
+	
+		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
+		if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
+		{
+			if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])
+			{
+				int4 pair;
+				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
+				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
+				pair.z = NEW_PAIR_MARKER;
+				pair.w = NEW_PAIR_MARKER;
+				
+				int pairIndex = atomic_inc(out_numPairs);
+				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
+			}
+			
+			if(!isLeaf)	//Internal node
+			{
+				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
+				{
+					//Error
+				}
+				else
+				{
+					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
+					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
+				}
+			}
+		}
+		
+	}
+}
+
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@ -0,0 +1,325 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* parallelLinearBvhCL= \
+"/*\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose,\n"
+"including commercial applications, and to alter it and redistribute it freely,\n"
+"subject to the following restrictions:\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Initial Author Jackson Lee, 2014\n"
+"typedef float b3Scalar;\n"
+"typedef float4 b3Vector3;\n"
+"#define b3Max max\n"
+"#define b3Min min\n"
+"typedef struct\n"
+"{\n"
+"	unsigned int m_key;\n"
+"	unsigned int m_value;\n"
+"} SortDataCL;\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} b3AabbCL;\n"
+"unsigned int interleaveBits(unsigned int x)\n"
+"{\n"
+"	//........ ........ ......12 3456789A	//x\n"
+"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n"
+"	\n"
+"	//........ ....1234 56789A12 3456789A	//x |= (x << 10)\n"
+"	//........ ....1111 1....... ...11111	//0x 00 0F 80 1F\n"
+"	//........ ....1234 5....... ...6789A	//x = ( x | (x << 10) ) & 0x000F801F; \n"
+"	\n"
+"	//.......1 23451234 5.....67 89A6789A	//x |= (x <<  5)\n"
+"	//.......1 1.....11 1.....11 .....111	//0x 01 83 83 07\n"
+"	//.......1 2.....34 5.....67 .....89A	//x = ( x | (x <<  5) ) & 0x01838307;\n"
+"	\n"
+"	//....12.1 2..34534 5..67.67 ..89A89A	//x |= (x <<  3)\n"
+"	//....1... 1..1...1 1..1...1 ..1...11	//0x 08 91 91 23\n"
+"	//....1... 2..3...4 5..6...7 ..8...9A	//x = ( x | (x <<  3) ) & 0x08919123;\n"
+"	\n"
+"	//...11..2 2.33..4N 5.66..77 .88..9NA	//x |= (x <<  1)	( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} )\n"
+"	//....1..1 ..1...1. 1..1..1. .1...1.1	//0x 09 22 92 45\n"
+"	//....1..2 ..3...4. 5..6..7. .8...9.A	//x = ( x | (x <<  1) ) & 0x09229245;\n"
+"	\n"
+"	//...11.22 .33..445 5.66.77. 88..99AA	//x |= (x <<  1)\n"
+"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 34 92 29\n"
+"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = ( x | (x <<  1) ) & 0x09349229;\n"
+"	\n"
+"	//........ ........ ......11 11111111	//0x000003FF\n"
+"	x &= 0x000003FF;		//Clear all bits above bit 10\n"
+"	\n"
+"	x = ( x | (x << 10) ) & 0x000F801F;\n"
+"	x = ( x | (x <<  5) ) & 0x01838307;\n"
+"	x = ( x | (x <<  3) ) & 0x08919123;\n"
+"	x = ( x | (x <<  1) ) & 0x09229245;\n"
+"	x = ( x | (x <<  1) ) & 0x09349229;\n"
+"	\n"
+"	return x;\n"
+"}\n"
+"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
+"{\n"
+"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
+"}\n"
+"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, \n"
+"												__global SortDataCL* out_mortonCodesAndAabbIndices, \n"
+"												b3Scalar cellSize, int numAabbs)\n"
+"{\n"
+"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
+"	if(leafNodeIndex >= numAabbs) return;\n"
+"	\n"
+"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
+"	\n"
+"	b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;\n"
+"	\n"
+"	//Quantize into integer coordinates\n"
+"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
+"	b3Vector3 gridPosition = center / cellSize;\n"
+"	\n"
+"	int4 discretePosition;\n"
+"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
+"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
+"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
+"	\n"
+"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
+"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
+"	discretePosition += 512;\n"
+"	\n"
+"	//Interleave bits(assign a morton code, also known as a z-curve)\n"
+"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
+"	\n"
+"	//\n"
+"	SortDataCL mortonCodeIndexPair;\n"
+"	mortonCodeIndexPair.m_key = mortonCode;\n"
+"	mortonCodeIndexPair.m_value = leafNodeIndex;\n"
+"	\n"
+"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
+"}\n"
+"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
+"#define B3_PLBVH_ROOT_NODE_MARKER -1	//Used to indicate that the node has no parent \n"
+"#define B3_PLBVH_ROOT_NODE_INDEX 0\n"
+"//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.\n"
+"//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).\n"
+"//\n"
+"//Since the root internal node is at index 0, no internal nodes should reference it as a child,\n"
+"//and so index 0 is always used to indicate a leaf node.\n"
+"int isLeafNode(int index) { return (index >= 0); }\n"
+"int getIndexWithInternalNodeMarkerRemoved(int index) { return (index >= 0) ? index : -index; }\n"
+"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : -index; }\n"
+"__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,\n"
+"									__global int* numNodesPerLevel,\n"
+"									__global int2* out_internalNodeChildIndices, \n"
+"									__global int* out_internalNodeParentNodes, \n"
+"									__global int* out_leafNodeParentNodes, \n"
+"									int numLevels, int numInternalNodes)\n"
+"{\n"
+"	int internalNodeIndex = get_global_id(0);\n"
+"	if(internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	//Find the level that this node is in, using linear search(could replace with binary search)\n"
+"	int level = 0;\n"
+"	int numInternalLevels = numLevels - 1;		//All levels except the last are internal nodes\n"
+"	for(; level < numInternalLevels; ++level)\n"
+"	{\n"
+"		if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break;\n"
+"	}\n"
+"	\n"
+"	//Check lower levels to find child nodes\n"
+"	//Left child is always in the next level, but the same does not apply to the right child\n"
+"	int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level];\n"
+"	int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1];	//Should never be out of bounds(see for loop above)\n"
+"	\n"
+"	int leftChildLevel = level + 1;\n"
+"	int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2;\n"
+"	\n"
+"	int rightChildLevel = level + 1;\n"
+"	int rightChildIndex = leftChildIndex + 1;\n"
+"	\n"
+"	//Under certain conditions, the right child index as calculated above is invalid; need to find the correct index\n"
+"	//\n"
+"	//First condition: must be at least 2 levels apart from the leaf node level;\n"
+"	//if the current level is right next to the leaf node level, then the right child\n"
+"	//will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access)\n"
+"	//\n"
+"	//Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid\n"
+"	//\n"
+"	//Third condition: must be the last node in its level\n"
+"	if( level < numLevels - 2 \n"
+"		&& numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] \n"
+"		&& indexInLevel == numNodesPerLevel[level] - 1 )\n"
+"	{\n"
+"		//Check lower levels until we find a node without a parent\n"
+"		for(; rightChildLevel < numLevels - 1; ++rightChildLevel)\n"
+"		{\n"
+"			int rightChildNextLevel = rightChildLevel + 1;\n"
+"		\n"
+"			//If this branch is taken, it means that the last node in rightChildNextLevel has no parent\n"
+"			if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] )\n"
+"			{\n"
+"				//Set the node to the last node in rightChildNextLevel\n"
+"				rightChildLevel = rightChildNextLevel;\n"
+"				rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1;\n"
+"				break;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	int isLeftChildLeaf = (leftChildLevel >= numLevels - 1);\n"
+"	int isRightChildLeaf = (rightChildLevel >= numLevels - 1);\n"
+"	\n"
+"	//If left/right child is a leaf node, the index needs to be corrected\n"
+"	//the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array,\n"
+"	//with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays\n"
+"	{\n"
+"		int leafNodeLevel = numLevels - 1;\n"
+"		leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n"
+"		rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n"
+"	}\n"
+"	\n"
+"	//Set the negative sign bit if the node is internal\n"
+"	int2 childIndices;\n"
+"	childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex);\n"
+"	childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex);\n"
+"	out_internalNodeChildIndices[internalNodeIndex] = childIndices;\n"
+"	\n"
+"	//Assign parent node index to children\n"
+"	__global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n"
+"	out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex;\n"
+"	\n"
+"	__global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n"
+"	out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex;\n"
+"}\n"
+"__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,\n"
+"										__global int* numNodesPerLevel, \n"
+"										__global int2* internalNodeChildIndices,\n"
+"										__global SortDataCL* mortonCodesAndAabbIndices,\n"
+"										__global b3AabbCL* leafNodeAabbs, \n"
+"										__global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if(i >= numInternalNodes) return;\n"
+"	\n"
+"	int numInternalLevels = numLevels - 1;\n"
+"	\n"
+"	//Starting from the level next to the leaf nodes, move towards the root(level 0)\n"
+"	for(int level = numInternalLevels - 1; level >= 0; --level)\n"
+"	{\n"
+"		int indexInLevel = i;	//Index relative to firstIndexOffsetPerLevel[level]\n"
+"		\n"
+"		int numNodesInLevel = numNodesPerLevel[level];\n"
+"		if(i < numNodesInLevel)\n"
+"		{\n"
+"			int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n"
+"			int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n"
+"			\n"
+"			int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x);\n"
+"			int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y);\n"
+"		\n"
+"			int isLeftChildLeaf = isLeafNode(childIndicies.x);\n"
+"			int isRightChildLeaf = isLeafNode(childIndicies.y);\n"
+"			\n"
+"			int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
+"			int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
+"			\n"
+"			b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n"
+"			b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n"
+"			\n"
+"			b3AabbCL internalNodeAabb;\n"
+"			internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
+"			internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
+"			out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n"
+"		}\n"
+"	\n"
+"		barrier(CLK_GLOBAL_MEM_FENCE);\n"
+"	}\n"
+"}\n"
+"//From sap.cl\n"
+"#define NEW_PAIR_MARKER -1\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"//From sap.cl\n"
+"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
+"											__global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,\n"
+"											__global SortDataCL* mortonCodesAndAabbIndices,\n"
+"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
+"											int maxPairs, int numQueryAabbs)\n"
+"{\n"
+"	int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
+"	if(queryRigidIndex >= numQueryAabbs) return;\n"
+"	\n"
+"	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n"
+"	//int queryRigidIndex = get_global_id(0);\n"
+"	//if(queryRigidIndex >= numQueryAabbs) return;\n"
+"	\n"
+"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
+"	\n"
+"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
+"	\n"
+"	//Starting by placing only the root node index, 0, in the stack causes it to be detected as a leaf node(see isLeafNode() in loop)\n"
+"	int stackSize = 2;\n"
+"	stack[0] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].x;\n"
+"	stack[1] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].y;\n"
+"	\n"
+"	while(stackSize)\n"
+"	{\n"
+"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
+"		--stackSize;\n"
+"		\n"
+"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
+"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
+"	\n"
+"		//bvhRigidIndex is not used if internal node\n"
+"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
+"	\n"
+"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
+"		if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
+"		{\n"
+"			if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])\n"
+"			{\n"
+"				int4 pair;\n"
+"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
+"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
+"				pair.z = NEW_PAIR_MARKER;\n"
+"				pair.w = NEW_PAIR_MARKER;\n"
+"				\n"
+"				int pairIndex = atomic_inc(out_numPairs);\n"
+"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
+"			}\n"
+"			\n"
+"			if(!isLeaf)	//Internal node\n"
+"			{\n"
+"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
+"				{\n"
+"					//Error\n"
+"				}\n"
+"				else\n"
+"				{\n"
+"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
+"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"	}\n"
+"}\n"
+;