Fix PLBVH reduction kernels, simplify nodes per level calculation.

Also calculate index ranges for each internal node.
2024-12-14 13:50:04 +00:00 · 2014-02-19 21:49:30 -08:00 · 2014-02-19 21:49:30 -08:00 · e955192971
commit e955192971
parent 7f0e361fa0
4 changed files with 193 additions and 175 deletions
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
@ -44,10 +44,8 @@ subject to the following restrictions:
 ///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree.
 /// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling.
 /// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the
-/// Z-curve anyways, and this method should be simpler and faster during construction. Additionally, it is still possible
-/// to improve the quality of the BVH by rearranging the connections between nodes.
-/// - Due to the way the tree is constructed, it becomes unnecessary to use atomic_inc to get
-/// the AABB for each internal node. Rather than traveling upwards from the leaf nodes, as in the paper,
+/// Z-curve anyways, and this method should be simpler and faster during construction.
+/// - Rather than traveling upwards towards the root from the leaf nodes, as in the paper,
 /// each internal node checks its child nodes to get its AABB.
 class b3GpuParallelLinearBvh
 {
@ -73,6 +71,7 @@ class b3GpuParallelLinearBvh
 	
 	//1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node
 	b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
+	b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges;		//x == min leaf index, y == max leaf index
 	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;				//x == left child, y == right child
 	b3OpenCLArray<int> m_internalNodeParentNodes;
 	
@ -90,6 +89,7 @@ public:
 		m_numNodesPerLevelGpu(context, queue),
 		m_firstIndexOffsetPerLevelGpu(context, queue),
 		m_internalNodeAabbs(context, queue),
+		m_internalNodeLeafIndexRanges(context, queue),
 		m_internalNodeChildNodes(context, queue),
 		m_internalNodeParentNodes(context, queue),
 		m_leafNodeParentNodes(context, queue),
@ -128,8 +128,6 @@ public:
 		
 		clReleaseProgram(m_parallelLinearBvhProgram);
 	}
-
-	//	fix: need to handle/test case with 2 nodes
 	
 	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs)
 	{
@ -143,6 +141,7 @@ public:
 		//
 		{
 			m_internalNodeAabbs.resize(numInternalNodes);
+			m_internalNodeLeafIndexRanges.resize(numInternalNodes);
 			m_internalNodeChildNodes.resize(numInternalNodes);
 			m_internalNodeParentNodes.resize(numInternalNodes);
 	
@ -180,17 +179,20 @@ public:
 			
 			//Calculate number of nodes in each level; 
 			//start from the second to last level(level right next to leaf nodes) and move towards the root(level 0)
-			int hasRemainder = 0;
+			int remainder = 0;
 			for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex)
 			{
 				int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1];		//For first iteration this == numLeaves
-			
-				bool allNodesAllocated = ( (numNodesPreviousLevel + hasRemainder) % 2 == 0 );
-			
-				int numNodesCurrentLevel = (allNodesAllocated) ? (numNodesPreviousLevel + hasRemainder) / 2 : numNodesPreviousLevel / 2;
-				m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel;
+				int numNodesCurrentLevel = numNodesPreviousLevel / 2;
 				
-				hasRemainder = static_cast<int>(!allNodesAllocated);
+				remainder += numNodesPreviousLevel % 2;
+				if(remainder == 2) 
+				{
+					numNodesCurrentLevel++;
+					remainder = 0;
+				}
+				
+				m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel;
 			}
 			
 			//Prefix sum to calculate the first index offset of each level
@ -232,17 +234,22 @@ public:
 			B3_PROFILE("Find AABB of merged nodes");
 		
 			m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs);	//Need to make a copy since the kernel modifies the array
-		
-			b3BufferInfoCL bufferInfo[] = 
+				
+			for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; 
+					numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
 			{
-				b3BufferInfoCL( m_mergedAabb.getBufferCL() )		//Resulting AABB is stored in m_mergedAabb[0]
-			};
+				b3BufferInfoCL bufferInfo[] = 
+				{
+					b3BufferInfoCL( m_mergedAabb.getBufferCL() )		//Resulting AABB is stored in m_mergedAabb[0]
+				};
+				
+				b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
+				launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst(numAabbsNeedingMerge);
+				
+				launcher.launch1D(numAabbsNeedingMerge);
+			}
 			
-			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
-			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(numLeaves);
-			
-			launcher.launch1D(numLeaves);
 			clFinish(m_queue);
 		}
 		
@ -315,7 +322,8 @@ public:
 				m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false);
 				clFinish(m_queue);
 				
-				for(int i = 0; i < 256; ++i) printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y);
+				for(int i = 0; i < numInternalNodes; ++i) 
+					printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y);
 				printf("\n");
 			}
 		}
@ -325,30 +333,58 @@ public:
 		{
 			B3_PROFILE("Set AABBs");
 		
-			b3BufferInfoCL bufferInfo[] = 
+			//Due to the arrangement of internal nodes, each internal node corresponds 
+			//to a contiguous range of leaf node indices. This characteristic can be used
+			//to optimize calculateOverlappingPairs(); checking if
+			//(m_internalNodeLeafIndexRanges[].y < leafNodeIndex) can be used to ensure that
+			//each pair is processed only once.
 			{
-				b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
-				b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
-				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
-				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
-				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
-				b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
-			};
+				B3_PROFILE("Reset internal node index ranges");
+				
+				b3Int2 invalidIndexRange;
+				invalidIndexRange.x = -1; 	//x == min
+				invalidIndexRange.y = -2;	//y == max
+				
+				m_fill.execute( m_internalNodeLeafIndexRanges, invalidIndexRange, m_internalNodeLeafIndexRanges.size() );
+				clFinish(m_queue);
+			}
 			
-			b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel");
-			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(numLevels);
-			launcher.setConst(numInternalNodes);
-			
-			launcher.launch1D(numLeaves);
+			int lastInternalLevelIndex = numLevels - 2;		//Last level is leaf node level
+			for(int level = lastInternalLevelIndex; level >= 0; --level)
+			{
+				b3BufferInfoCL bufferInfo[] = 
+				{
+					b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ),
+					b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ),
+					b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+					b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
+					b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+					b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
+					b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
+				};
+				
+				b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel");
+				launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst(numLevels);
+				launcher.setConst(numInternalNodes);
+				launcher.setConst(level);
+				
+				launcher.launch1D(numLeaves);
+			}
 			clFinish(m_queue);
 			
 			if(0)
 			{
-				b3SapAabb mergedAABB = m_mergedAabb.at(0);
-				printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z);
-				printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z);
+				static b3AlignedObjectArray<b3Int2> leafIndexRanges;	
+				m_internalNodeLeafIndexRanges.copyToHost(leafIndexRanges, false);
+				clFinish(m_queue);
+				
+				for(int i = 0; i < numInternalNodes; ++i) 
+					//if(leafIndexRanges[i].x == -1 || leafIndexRanges[i].y == -1)
+						printf("leafIndexRanges[%d]: %d, %d\n", i, leafIndexRanges[i].x, leafIndexRanges[i].y);
+				printf("\n");
 			}
+			
 			if(0)
 			{
 				static b3AlignedObjectArray<b3SapAabb> rigidAabbs;
@ -363,12 +399,18 @@ public:
 					actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec);
 					actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec);
 				}
-				printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z);
-				printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z);
-			
+				
 				b3SapAabb rootAabb = m_internalNodeAabbs.at(0);
-				printf("rootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z);
-				printf("rootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z);
+				b3SapAabb mergedAABB = m_mergedAabb.at(0);
+				
+				printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z);
+				printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z);
+				printf("kernelRootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z);
+				
+				printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z);
+				printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z);
+				printf("kernelRootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z);
+			
 				printf("\n");
 			}
 		}
@ -397,6 +439,7 @@ public:
 				
 				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
 				b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
 				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
 				
 				b3BufferInfoCL( out_numPairs.getBufferCL() ),
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
@ -77,11 +77,7 @@ public:
 	virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
 	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
 	
-	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
-	{
-		b3Assert(0);	//CPU version not implemented
-		return m_aabbsCpu;
-	}
+	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
 	
 	static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
 	{
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
@ -80,60 +80,32 @@ unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
 	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
 }

-
-__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)
+//Should replace with an optimized parallel reduction
+__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
 {
-	int aabbIndex = get_global_id(0);
-	if(aabbIndex >= numAabbs) return;
-	
-	//Find the most significant bit(msb)
-	int mostSignificantBit = 0;
-	{
-		int temp = numAabbs;
-		while(temp >>= 1) mostSignificantBit++;		//Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
-	}
-	
-	int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );
-	int numRemainingAabbs = (1 << mostSignificantBit);
-	
-	//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2
-	//For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]
-	if(aabbIndex < numberOfAabbsAboveMsbSplit)
-	{
-		int otherAabbIndex = numRemainingAabbs + aabbIndex;
-		
-		b3AabbCL aabb = out_mergedAabb[aabbIndex];
-		b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
-		
-		b3AabbCL mergedAabb;
-		mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
-		mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
-		out_mergedAabb[aabbIndex] = mergedAabb;
-	}
-	
-	barrier(CLK_GLOBAL_MEM_FENCE);
-	
+	//Each time this kernel is added to the command queue, 
+	//the number of AABBs needing to be merged is halved
 	//
-	int offset = numRemainingAabbs / 2;
-	while(offset >= 1)
-	{
-		if(aabbIndex < offset)
-		{
-			int otherAabbIndex = aabbIndex + offset;
+	//Example with 159 AABBs:
+	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80
+	//	numMergedAabbs == 159 - 80 == 79
+	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]
+	
+	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
+	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;
+	
+	int aabbIndex = get_global_id(0);
+	if(aabbIndex >= numMergedAabbs) return;
+	
+	int otherAabbIndex = aabbIndex + numRemainingAabbs;
+	
+	b3AabbCL aabb = out_mergedAabb[aabbIndex];
+	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
 		
-			b3AabbCL aabb = out_mergedAabb[aabbIndex];
-			b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
-		
-			b3AabbCL mergedAabb;
-			mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
-			mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
-			out_mergedAabb[aabbIndex] = mergedAabb;
-		}
-		
-		offset /= 2;
-		
-		barrier(CLK_GLOBAL_MEM_FENCE);
-	}
+	b3AabbCL mergedAabb;
+	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
+	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
+	out_mergedAabb[aabbIndex] = mergedAabb;
 }

 __kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, 
@ -254,7 +226,7 @@ __kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,
 	{
 		int leafNodeLevel = numLevels - 1;
 		leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;
-		rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;
+		rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;
 	}
 	
 	//Set the negative sign bit if the node is internal
@ -276,20 +248,19 @@ __kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,
 										__global int2* internalNodeChildIndices,
 										__global SortDataCL* mortonCodesAndAabbIndices,
 										__global b3AabbCL* leafNodeAabbs, 
-										__global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)
+										__global int2* out_internalNodeLeafIndexRanges,
+										__global b3AabbCL* out_internalNodeAabbs, 
+										int numLevels, int numInternalNodes, int level)
 {
 	int i = get_global_id(0);
 	if(i >= numInternalNodes) return;
 	
-	int numInternalLevels = numLevels - 1;
-	
-	//Starting from the level next to the leaf nodes, move towards the root(level 0)
-	for(int level = numInternalLevels - 1; level >= 0; --level)
+	//For each node in a level, check its child nodes to determine its AABB
 	{
 		int indexInLevel = i;	//Index relative to firstIndexOffsetPerLevel[level]
 		
 		int numNodesInLevel = numNodesPerLevel[level];
-		if(i < numNodesInLevel)
+		if(indexInLevel < numNodesInLevel)
 		{
 			int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];
 			int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];
@ -300,19 +271,26 @@ __kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,
 			int isLeftChildLeaf = isLeafNode(childIndicies.x);
 			int isRightChildLeaf = isLeafNode(childIndicies.y);
 			
+			//left/RightChildLeafIndex == Rigid body indicies
 			int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
 			int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
 			
 			b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];
 			b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];
 			
+			//
 			b3AabbCL internalNodeAabb;
 			internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
 			internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
 			out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;
+			
+			//For index range, x == min and y == max; left child always has lower index
+			int2 leafIndexRange;
+			leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x;
+			leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y;
+			
+			out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange;
 		}
-	
-		barrier(CLK_GLOBAL_MEM_FENCE);
 	}
 }

@ -331,7 +309,9 @@ bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
 //From sap.cl

 __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, 
-											__global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,
+											__global int2* internalNodeChildIndices, 
+											__global b3AabbCL* internalNodeAabbs,
+											__global int2* internalNodeLeafIndexRanges,
 											__global SortDataCL* mortonCodesAndAabbIndices,
 											__global int* out_numPairs, __global int4* out_overlappingPairs, 
 											int maxPairs, int numQueryAabbs)
@ -341,7 +321,8 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
 	int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
 	if(queryRigidIndex >= numQueryAabbs) return;
 	
-	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;
+	int queryBvhNodeIndex = queryRigidIndex;
+	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;		//	fix queryRigidIndex naming for this branch
 #else
 	int queryRigidIndex = get_global_id(0);
 	if(queryRigidIndex >= numQueryAabbs) return;
@ -363,7 +344,15 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
 		
 		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false
 		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
-	
+		
+		//Optimization - if the node is not a leaf, check whether the highest leaf index of that node
+		//is less than the queried node's index to avoid testing each pair twice.
+		{
+			//	fix: produces duplicate pairs
+		//	int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;
+		//	if(highestLeafIndex < queryBvhNodeIndex) continue;
+		}
+		
 		//bvhRigidIndex is not used if internal node
 		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
 	
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@ -75,59 +75,32 @@ static const char* parallelLinearBvhCL= \
 "{\n"
 "	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
 "}\n"
-"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)\n"
+"//Should replace with an optimized parallel reduction\n"
+"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
 "{\n"
-"	int aabbIndex = get_global_id(0);\n"
-"	if(aabbIndex >= numAabbs) return;\n"
-"	\n"
-"	//Find the most significant bit(msb)\n"
-"	int mostSignificantBit = 0;\n"
-"	{\n"
-"		int temp = numAabbs;\n"
-"		while(temp >>= 1) mostSignificantBit++;		//Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)\n"
-"	}\n"
-"	\n"
-"	int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );\n"
-"	int numRemainingAabbs = (1 << mostSignificantBit);\n"
-"	\n"
-"	//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2\n"
-"	//For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]\n"
-"	if(aabbIndex < numberOfAabbsAboveMsbSplit)\n"
-"	{\n"
-"		int otherAabbIndex = numRemainingAabbs + aabbIndex;\n"
-"		\n"
-"		b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
-"		b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
-"		\n"
-"		b3AabbCL mergedAabb;\n"
-"		mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
-"		mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
-"		out_mergedAabb[aabbIndex] = mergedAabb;\n"
-"	}\n"
-"	\n"
-"	barrier(CLK_GLOBAL_MEM_FENCE);\n"
-"	\n"
+"	//Each time this kernel is added to the command queue, \n"
+"	//the number of AABBs needing to be merged is halved\n"
 "	//\n"
-"	int offset = numRemainingAabbs / 2;\n"
-"	while(offset >= 1)\n"
-"	{\n"
-"		if(aabbIndex < offset)\n"
-"		{\n"
-"			int otherAabbIndex = aabbIndex + offset;\n"
+"	//Example with 159 AABBs:\n"
+"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
+"	//	numMergedAabbs == 159 - 80 == 79\n"
+"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
+"	\n"
+"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
+"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
+"	\n"
+"	int aabbIndex = get_global_id(0);\n"
+"	if(aabbIndex >= numMergedAabbs) return;\n"
+"	\n"
+"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
+"	\n"
+"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
+"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
 "		\n"
-"			b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
-"			b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
-"		\n"
-"			b3AabbCL mergedAabb;\n"
-"			mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
-"			mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
-"			out_mergedAabb[aabbIndex] = mergedAabb;\n"
-"		}\n"
-"		\n"
-"		offset /= 2;\n"
-"		\n"
-"		barrier(CLK_GLOBAL_MEM_FENCE);\n"
-"	}\n"
+"	b3AabbCL mergedAabb;\n"
+"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
+"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
+"	out_mergedAabb[aabbIndex] = mergedAabb;\n"
 "}\n"
 "__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
 "												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
@ -244,7 +217,7 @@ static const char* parallelLinearBvhCL= \
 "	{\n"
 "		int leafNodeLevel = numLevels - 1;\n"
 "		leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n"
-"		rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n"
+"		rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n"
 "	}\n"
 "	\n"
 "	//Set the negative sign bit if the node is internal\n"
@ -265,20 +238,19 @@ static const char* parallelLinearBvhCL= \
 "										__global int2* internalNodeChildIndices,\n"
 "										__global SortDataCL* mortonCodesAndAabbIndices,\n"
 "										__global b3AabbCL* leafNodeAabbs, \n"
-"										__global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)\n"
+"										__global int2* out_internalNodeLeafIndexRanges,\n"
+"										__global b3AabbCL* out_internalNodeAabbs, \n"
+"										int numLevels, int numInternalNodes, int level)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if(i >= numInternalNodes) return;\n"
 "	\n"
-"	int numInternalLevels = numLevels - 1;\n"
-"	\n"
-"	//Starting from the level next to the leaf nodes, move towards the root(level 0)\n"
-"	for(int level = numInternalLevels - 1; level >= 0; --level)\n"
+"	//For each node in a level, check its child nodes to determine its AABB\n"
 "	{\n"
 "		int indexInLevel = i;	//Index relative to firstIndexOffsetPerLevel[level]\n"
 "		\n"
 "		int numNodesInLevel = numNodesPerLevel[level];\n"
-"		if(i < numNodesInLevel)\n"
+"		if(indexInLevel < numNodesInLevel)\n"
 "		{\n"
 "			int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n"
 "			int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n"
@ -289,19 +261,26 @@ static const char* parallelLinearBvhCL= \
 "			int isLeftChildLeaf = isLeafNode(childIndicies.x);\n"
 "			int isRightChildLeaf = isLeafNode(childIndicies.y);\n"
 "			\n"
+"			//left/RightChildLeafIndex == Rigid body indicies\n"
 "			int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
 "			int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
 "			\n"
 "			b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n"
 "			b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n"
 "			\n"
+"			//\n"
 "			b3AabbCL internalNodeAabb;\n"
 "			internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
 "			internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
 "			out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n"
+"			\n"
+"			//For index range, x == min and y == max; left child always has lower index\n"
+"			int2 leafIndexRange;\n"
+"			leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x;\n"
+"			leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y;\n"
+"			\n"
+"			out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange;\n"
 "		}\n"
-"	\n"
-"		barrier(CLK_GLOBAL_MEM_FENCE);\n"
 "	}\n"
 "}\n"
 "//From sap.cl\n"
@ -316,7 +295,9 @@ static const char* parallelLinearBvhCL= \
 "}\n"
 "//From sap.cl\n"
 "__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
-"											__global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,\n"
+"											__global int2* internalNodeChildIndices, \n"
+"											__global b3AabbCL* internalNodeAabbs,\n"
+"											__global int2* internalNodeLeafIndexRanges,\n"
 "											__global SortDataCL* mortonCodesAndAabbIndices,\n"
 "											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
 "											int maxPairs, int numQueryAabbs)\n"
@ -326,7 +307,8 @@ static const char* parallelLinearBvhCL= \
 "	int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
 "	if(queryRigidIndex >= numQueryAabbs) return;\n"
 "	\n"
-"	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n"
+"	int queryBvhNodeIndex = queryRigidIndex;\n"
+"	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;		//	fix queryRigidIndex naming for this branch\n"
 "#else\n"
 "	int queryRigidIndex = get_global_id(0);\n"
 "	if(queryRigidIndex >= numQueryAabbs) return;\n"
@ -347,7 +329,15 @@ static const char* parallelLinearBvhCL= \
 "		\n"
 "		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
 "		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
-"	\n"
+"		\n"
+"		//Optimization - if the node is not a leaf, check whether the highest leaf index of that node\n"
+"		//is less than the queried node's index to avoid testing each pair twice.\n"
+"		{\n"
+"			//	fix: produces duplicate pairs\n"
+"		//	int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
+"		//	if(highestLeafIndex < queryBvhNodeIndex) continue;\n"
+"		}\n"
+"		\n"
 "		//bvhRigidIndex is not used if internal node\n"
 "		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
 "	\n"