From 09aa2dbbe7d968ca7d63d8af3be03bdce0da1f5c Mon Sep 17 00:00:00 2001
From: rponom <rponom@08e121b0-ae19-0410-a57b-3be3395fd4fd>
Date: Tue, 25 Nov 2008 03:16:11 +0000
Subject: [PATCH] CPU implementation of btCudaBroadphase added. It is called
 bt3DGridBroadphase and btCudaBroadphase is now derived from it rater than
 from btSimpleBroadphase Test of bt3DGridBroadphase was added to
 CDTestFramework

---
 .../BulletSAPCompleteBoxPruningTest.cpp       |   4 +
 Extras/CDTestFramework/CDTestFramework.cpp    |   3 +
 Extras/CUDA/bt3DGridBroadphase.cpp            | 604 ++++++++++++++++++
 Extras/CUDA/bt3DGridBroadphaseFunc.h          | 444 +++++++++++++
 Extras/CUDA/btCudaBroadphase.cpp              | 453 +++----------
 Extras/CUDA/btCudaBroadphase.cu               | 460 +------------
 Extras/CUDA/btCudaBroadphase.h                |  88 ++-
 Extras/CUDA/btCudaBroadphaseKernel.h          |  21 +
 Extras/CUDA/libbulletcuda.vcproj              |   8 +
 Extras/CUDA/particleSystem.cpp                |   1 +
 10 files changed, 1247 insertions(+), 839 deletions(-)
 create mode 100644 Extras/CUDA/bt3DGridBroadphase.cpp
 create mode 100644 Extras/CUDA/bt3DGridBroadphaseFunc.h

diff --git a/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp b/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp
index cb4aaaa7d..6cc3ef612 100644
--- a/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp
+++ b/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp
@@ -433,6 +433,10 @@ BulletSAPCompleteBoxPruningTest::BulletSAPCompleteBoxPruningTest(int numBoxes,in
 //		m_broadphase = new btCudaBroadphase(aabbMin, aabbMax, 32, 32, 32, 8192, 8192, 64, 16);
 		methodname	=	"btCudaBroadphase";
 		break;
+	case 9:
+		m_broadphase = new bt3DGridBroadphase(aabbMin, aabbMax, 24, 24, 24,maxNumBoxes , maxNumBoxes, 64, 16);
+		methodname	=	"bt3DGridBroadphase";
+		break;
 	default:
 		{
 			m_broadphase = new btAxisSweep3(aabbMin,aabbMax,numBoxes,new btNullPairCache());
diff --git a/Extras/CDTestFramework/CDTestFramework.cpp b/Extras/CDTestFramework/CDTestFramework.cpp
index 5397aaff6..db0d42cff 100644
--- a/Extras/CDTestFramework/CDTestFramework.cpp
+++ b/Extras/CDTestFramework/CDTestFramework.cpp
@@ -76,6 +76,7 @@ enum TestIndex
 //	TEST_BIPARTITE_BOX_PRUNING,
 	TEST_DBVT_8192,
 	TEST_BULLET_CUDA_8192,
+	TEST_BULLET_3DGRID_8192,
  	TEST_OPCODE_ARRAY_SAP,
 	MAX_NB_TESTS
 };
@@ -312,6 +313,7 @@ int main(int argc, char** argv)
 //			{TEST_BIPARTITE_BOX_PRUNING, "Bipartite box pruning"},
 			{TEST_DBVT_8192, "Bullet DBVT 8192"},
 			{TEST_BULLET_CUDA_8192, "Bullet CUDA 8192"},
+			{TEST_BULLET_3DGRID_8192, "Bullet 3D Grid 8192"},
 			{TEST_OPCODE_ARRAY_SAP, "OPCODE ARRAY SAP"},
 		};
 		TwType testType = TwDefineEnum("CollisionTest", testEV, MAX_NB_TESTS);
@@ -335,6 +337,7 @@ int main(int argc, char** argv)
 //	gCollisionTests[TEST_BIPARTITE_BOX_PRUNING]	= new BipartiteBoxPruningTest;
 	gCollisionTests[TEST_DBVT_8192]	= new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,7);
 	gCollisionTests[TEST_BULLET_CUDA_8192]	= new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,8);
+	gCollisionTests[TEST_BULLET_3DGRID_8192]	= new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,9);
 	gCollisionTests[TEST_OPCODE_ARRAY_SAP]	= new OpcodeArraySAPTest(NUM_SAP_BOXES);
 
 	for(int i=0;i<MAX_NB_TESTS;i++)
diff --git a/Extras/CUDA/bt3DGridBroadphase.cpp b/Extras/CUDA/bt3DGridBroadphase.cpp
new file mode 100644
index 000000000..b46170e88
--- /dev/null
+++ b/Extras/CUDA/bt3DGridBroadphase.cpp
@@ -0,0 +1,604 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2008 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//--------------------------------------------------------------------------
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "btCudaBroadphaseKernel.h"
+#include "btCudaBroadphase.h"
+
+//--------------------------------------------------------------------------
+
+#include <stdio.h>
+
+//--------------------------------------------------------------------------
+
+static btCudaBroadphaseParams s3DGridBroadphaseParams;
+
+//--------------------------------------------------------------------------
+
+bt3DGridBroadphase::bt3DGridBroadphase(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										int maxBodiesPerCell,
+										btScalar cellFactorAABB) :
+	btSimpleBroadphase(maxSmallProxies,
+//				     new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache),
+				     new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache),
+	m_bInitialized(false),
+    m_numBodies(0)
+{
+	m_ownsPairCache = true;
+	m_params.m_gridSizeX = gridSizeX;
+	m_params.m_gridSizeY = gridSizeY;
+	m_params.m_gridSizeZ = gridSizeZ;
+	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
+	btVector3 w_org = worldAabbMin;
+	m_params.m_worldOriginX = w_org.getX();
+	m_params.m_worldOriginY = w_org.getY();
+	m_params.m_worldOriginZ = w_org.getZ();
+	btVector3 w_size = worldAabbMax - worldAabbMin;
+	m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
+	m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
+	m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
+	m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
+	m_maxRadius *= btScalar(0.5f);
+	m_params.m_numBodies = m_numBodies;
+	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
+
+	m_numLargeHandles = 0;						
+	m_maxLargeHandles = maxLargeProxies;
+
+	m_maxPairsPerBody = maxPairsPerBody;
+
+	m_cellFactorAABB = cellFactorAABB;
+
+	m_LastLargeHandleIndex = -1;
+
+	_initialize();
+} // bt3DGridBroadphase::bt3DGridBroadphase()
+
+//--------------------------------------------------------------------------
+
+bt3DGridBroadphase::~bt3DGridBroadphase()
+{
+	//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
+	assert(m_bInitialized);
+	_finalize();
+} // bt3DGridBroadphase::~bt3DGridBroadphase()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::_initialize()
+{
+    assert(!m_bInitialized);
+    // allocate host storage
+    m_hBodiesHash = new unsigned int[m_maxHandles * 2];
+    memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
+
+    m_hCellStart = new unsigned int[m_params.m_numCells];
+    memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
+
+	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
+	// --------------- for now, init with m_maxPairsPerBody for each body
+	m_hPairBuffStartCurr[0] = 0;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int i = 1; i <= m_maxHandles; i++) 
+	{
+		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
+		m_hPairBuffStartCurr[i * 2 + 1] = 0;
+	}
+	//----------------
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	m_hAABB = new btCuda3F1U[numAABB * 2]; // AABB Min & Max
+
+	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
+
+	m_hPairScan = new unsigned int[m_maxHandles + 1];
+
+	m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+
+// large proxies
+
+	// allocate handles buffer and put all handles on free list
+	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
+	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
+	m_firstFreeLargeHandle = 0;
+	{
+		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
+		{
+			m_pLargeHandles[i].SetNextFree(i + 1);
+			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
+		}
+		m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
+	}
+
+// debug data
+	m_numPairsAdded = 0;
+	m_numOverflows = 0;
+
+    m_bInitialized = true;
+} // bt3DGridBroadphase::_initialize()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::_finalize()
+{
+    assert(m_bInitialized);
+    delete [] m_hBodiesHash;
+    delete [] m_hCellStart;
+    delete [] m_hPairBuffStartCurr;
+    delete [] m_hAABB;
+	delete [] m_hPairBuff;
+	delete [] m_hPairScan;
+	delete [] m_hPairOut;
+	btAlignedFree(m_pLargeHandlesRawPtr);
+	m_bInitialized = false;
+} // bt3DGridBroadphase::_finalize()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
+{
+	if(m_numHandles <= 0)
+	{
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+		return;
+	}
+	// update constants
+	setParameters(&m_params);
+	// prepare AABB array
+	prepareAABB();
+	// calculate hash
+	calcHashAABB();
+	// sort bodies based on hash
+	sortHash();
+	// find start of each cell
+	findCellStart();
+	// findOverlappingPairs (small/small)
+	findOverlappingPairs();
+	// findOverlappingPairs (small/large)
+	findPairsLarge();
+	// add pairs to CPU cache
+	computePairCacheChanges();
+	scanOverlappingPairBuff();
+	squeezeOverlappingPairBuff();
+	addPairsToCache(dispatcher);
+	// find and add large/large pairs to CPU cache
+	addLarge2LargePairsToCache(dispatcher);
+	return;
+} // bt3DGridBroadphase::calculateOverlappingPairs()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
+{
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	for(int i = 0; i < m_numHandles; i++) 
+	{
+		unsigned int num = m_hPairScan[i+1] - m_hPairScan[i];
+		if(!num)
+		{
+			continue;
+		}
+		unsigned int* pInp = m_hPairOut + m_hPairScan[i];
+		unsigned int index0 = m_hAABB[i * 2].uw;
+		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0];
+		for(unsigned int j = 0; j < num; j++)
+		{
+			unsigned int indx1_s = pInp[j];
+			unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG);
+			btSimpleBroadphaseProxy* proxy1;
+			if(index1 < (unsigned int)m_maxHandles)
+			{
+				proxy1 = &m_pHandles[index1];
+			}
+			else
+			{
+				index1 -= m_maxHandles;
+				btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles));
+				proxy1 = &m_pLargeHandles[index1];
+			}
+			if(indx1_s & BT_CUDA_PAIR_NEW_FLG)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+				m_numPairsAdded++;
+			}
+			else
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+				m_numPairsRemoved++;
+			}
+		}
+	}
+} // bt3DGridBroadphase::addPairsToCache()
+
+//--------------------------------------------------------------------------
+
+btBroadphaseProxy* bt3DGridBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
+{
+	btBroadphaseProxy*  proxy;
+	bool bIsLarge = isLargeProxy(aabbMin, aabbMax);
+	if(bIsLarge)
+	{
+		if (m_numLargeHandles >= m_maxLargeHandles)
+		{
+			btAssert(0);
+			return 0; //should never happen, but don't let the game crash ;-)
+		}
+		btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
+		int newHandleIndex = allocLargeHandle();
+		proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
+	}
+	else
+	{
+		proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
+	}
+	return proxy;
+} // bt3DGridBroadphase::createProxy()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
+{
+	bool bIsLarge = isLargeProxy(proxy);
+	if(bIsLarge)
+	{
+		
+		btSimpleBroadphaseProxy* proxy0 = static_cast<btSimpleBroadphaseProxy*>(proxy);
+		freeLargeHandle(proxy0);
+		m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
+	}
+	else
+	{
+		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
+	}
+	return;
+} // bt3DGridBroadphase::destroyProxy()
+
+//--------------------------------------------------------------------------
+
+bool bt3DGridBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
+{
+	btVector3 diag = aabbMax - aabbMin;
+	btScalar radius = diag.length() * btScalar(0.5f);
+
+	radius *= m_cellFactorAABB; // user-defined factor
+
+	return (radius > m_maxRadius);
+} // bt3DGridBroadphase::isLargeProxy()
+
+//--------------------------------------------------------------------------
+
+bool bt3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
+{
+	return (proxy->getUid() >= (m_maxHandles+2));
+} // bt3DGridBroadphase::isLargeProxy()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
+{
+	int i,j;
+	if (m_numLargeHandles <= 0)
+	{
+		return;
+	}
+	int new_largest_index = -1;
+	for(i = 0; i <= m_LastLargeHandleIndex; i++)
+	{
+		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
+		if(!proxy0->m_clientObject)
+		{
+			continue;
+		}
+		new_largest_index = i;
+		for(j = i + 1; j <= m_LastLargeHandleIndex; j++)
+		{
+			btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j];
+			if(!proxy1->m_clientObject)
+			{
+				continue;
+			}
+			btAssert(proxy0 != proxy1);
+			btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
+			btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
+			if(aabbOverlap(p0,p1))
+			{
+				if (!m_pairCache->findPair(proxy0,proxy1))
+				{
+					m_pairCache->addOverlappingPair(proxy0,proxy1);
+				}
+			} 
+			else
+			{
+				if(m_pairCache->findPair(proxy0,proxy1))
+				{
+					m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+				}
+			}
+		}
+	}
+	m_LastLargeHandleIndex = new_largest_index;
+	return;
+} // bt3DGridBroadphase::addLarge2LargePairsToCache()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
+{
+	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
+	for (int i=0; i <= m_LastLargeHandleIndex; i++)
+	{
+		btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i];
+		if(!proxy->m_clientObject)
+		{
+			continue;
+		}
+		rayCallback.process(proxy);
+	}
+} // bt3DGridBroadphase::rayTest()
+
+//--------------------------------------------------------------------------
+//--------------------------------------------------------------------------
+//
+// overrides for CPU version
+//
+//--------------------------------------------------------------------------
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::prepareAABB()
+{
+	BT_PROFILE("prepareAABB");
+	btCuda3F1U* pBB = m_hAABB;
+	int i;
+	int new_largest_index = -1;
+	unsigned int num_small = 0;
+	for(i = 0; i <= m_LastHandleIndex; i++) 
+	{
+		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i];
+		if(!proxy0->m_clientObject)
+		{
+			continue;
+		}
+		new_largest_index = i;
+		pBB->fx = proxy0->m_aabbMin.getX();
+		pBB->fy = proxy0->m_aabbMin.getY();
+		pBB->fz = proxy0->m_aabbMin.getZ();
+		pBB->uw = i;
+		pBB++;
+		pBB->fx = proxy0->m_aabbMax.getX();
+		pBB->fy = proxy0->m_aabbMax.getY();
+		pBB->fz = proxy0->m_aabbMax.getZ();
+		pBB->uw = num_small;
+		pBB++;
+		num_small++;
+	}
+	m_LastHandleIndex = new_largest_index;
+	new_largest_index = -1;
+	unsigned int num_large = 0;
+	for(i = 0; i <= m_LastLargeHandleIndex; i++) 
+	{
+		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
+		if(!proxy0->m_clientObject)
+		{
+			continue;
+		}
+		new_largest_index = i;
+		pBB->fx = proxy0->m_aabbMin.getX();
+		pBB->fy = proxy0->m_aabbMin.getY();
+		pBB->fz = proxy0->m_aabbMin.getZ();
+		pBB->uw = i + m_maxHandles;
+		pBB++;
+		pBB->fx = proxy0->m_aabbMax.getX();
+		pBB->fy = proxy0->m_aabbMax.getY();
+		pBB->fz = proxy0->m_aabbMax.getZ();
+		pBB->uw = num_large + m_maxHandles;
+		pBB++;
+		num_large++;
+	}
+	m_LastLargeHandleIndex = new_largest_index;
+	// paranoid checks
+	btAssert(num_small == m_numHandles);
+	btAssert(num_large == m_numLargeHandles);
+	return;
+} // bt3DGridBroadphase::prepareAABB()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::setParameters(btCudaBroadphaseParams* hostParams)
+{
+	s3DGridBroadphaseParams = *hostParams;
+	return;
+} // bt3DGridBroadphase::setParameters()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::calcHashAABB()
+{
+	BT_PROFILE("bt3DGrid_calcHashAABB");
+	bt3DGrid_calcHashAABB(m_hAABB, m_hBodiesHash, m_numHandles);
+	return;
+} // bt3DGridBroadphase::calcHashAABB()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::sortHash()
+{
+	class bt3DGridHashKey
+	{
+	public:
+	   unsigned int hash;
+	   unsigned int index;
+	   void quickSort(bt3DGridHashKey* pData, int lo, int hi)
+	   {
+			int i=lo, j=hi;
+			bt3DGridHashKey x = pData[(lo+hi)/2];
+			do
+			{    
+				while(pData[i].hash > x.hash) i++; 
+				while(x.hash > pData[j].hash) j--;
+				if(i <= j)
+				{
+					bt3DGridHashKey t = pData[i];
+					pData[i] = pData[j];
+					pData[j] = t;
+					i++; j--;
+				}
+			} while(i <= j);
+			if(lo < j) pData->quickSort(pData, lo, j);
+			if(i < hi) pData->quickSort(pData, i, hi);
+	   }
+	};
+	BT_PROFILE("bt3DGrid_sortHash");
+	bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash;
+	pHash->quickSort(pHash, 0, m_numHandles - 1);
+	return;
+} // bt3DGridBroadphase::sortHash()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::findCellStart()
+{
+	BT_PROFILE("bt3DGrid_findCellStart");
+	bt3DGrid_findCellStart(m_hBodiesHash, m_hCellStart, m_numHandles, m_params.m_numCells);
+	return;
+} // bt3DGridBroadphase::findCellStart()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::findOverlappingPairs()
+{
+	BT_PROFILE("bt3DGrid_findOverlappingPairs");
+	bt3DGrid_findOverlappingPairs(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles);
+	return;
+} // bt3DGridBroadphase::findOverlappingPairs()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::findPairsLarge()
+{
+	BT_PROFILE("bt3DGrid_findPairsLarge");
+	bt3DGrid_findPairsLarge(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr,	m_numHandles, m_numLargeHandles);
+	return;
+} // bt3DGridBroadphase::findPairsLarge()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::computePairCacheChanges()
+{
+	BT_PROFILE("bt3DGrid_computePairCacheChanges");
+	bt3DGrid_computePairCacheChanges(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hAABB, m_numHandles);
+	return;
+} // bt3DGridBroadphase::computePairCacheChanges()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::scanOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
+	m_hPairScan[0] = 0;
+	for(int i = 1; i <= m_numHandles; i++) 
+	{
+		unsigned int delta = m_hPairScan[i];
+		m_hPairScan[i] = m_hPairScan[i-1] + delta;
+	}
+	return;
+} // bt3DGridBroadphase::scanOverlappingPairBuff()
+
+//--------------------------------------------------------------------------
+
+void bt3DGridBroadphase::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
+	bt3DGrid_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hPairOut, m_hAABB, m_numHandles);
+	return;
+} // bt3DGridBroadphase::squeezeOverlappingPairBuff()
+
+//--------------------------------------------------------------------------
+
+typedef unsigned int uint;
+
+struct uint2
+{
+	unsigned int x, y;
+};
+
+struct int3
+{
+	int x, y, z;
+};
+
+struct uint3
+{
+	unsigned int x, y, z;
+};
+
+struct float4
+{
+	float x, y, z, w;
+};
+
+
+#define BT3DGRID__device__ inline
+#define BT3DGRIDmax(a, b) ((a) > (b) ? (a) : (b))
+#define BT3DGRIDmin(a, b) ((a) < (b) ? (a) : (b))
+#define BT3DGRIDparams s3DGridBroadphaseParams
+#define BT3DGRID__mul24(a, b) ((a)*(b))
+#define BT3DGRID__global__ inline
+#define BT3DGRID__shared__ static
+#define BT3DGRID__syncthreads()
+
+
+static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
+{
+  uint2 t; t.x = x; t.y = y; return t;
+}
+#define BT3DGRIDmake_uint2(x, y) bt3dGrid_make_uint2(x, y)
+
+static inline int3 bt3dGrid_make_int3(int x, int y, int z)
+{
+  int3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+inline int3 operator+(int3 a, int3 b)
+{
+    return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+#define BT3DGRIDmake_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
+
+#define BT3DGRIDFETCH(a, b) a[b]
+#define BT3DGRIDPREF(func) bt3DGrid_##func
+#define MY_CUDA_SAFE_CALL(func) func
+#define BT3DGPRDMemset memset
+
+static uint2 s_blockIdx, s_blockDim, s_threadIdx;
+#define BT3DGRIDblockIdx s_blockIdx
+#define BT3DGRIDblockDim s_blockDim
+#define BT3DGRIDthreadIdx s_threadIdx
+#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
+
+#define CUT_CHECK_ERROR(s)
+
+//--------------------------------------------------------------------------
+
+
+#include "bt3DGridBroadphaseFunc.h"
+
+//--------------------------------------------------------------------------
diff --git a/Extras/CUDA/bt3DGridBroadphaseFunc.h b/Extras/CUDA/bt3DGridBroadphaseFunc.h
new file mode 100644
index 000000000..8c5f28e39
--- /dev/null
+++ b/Extras/CUDA/bt3DGridBroadphaseFunc.h
@@ -0,0 +1,444 @@
+/*
+ * Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:   
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and 
+ * international Copyright laws.  
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 
+ * OR PERFORMANCE OF THIS SOURCE CODE.  
+ *
+ * U.S. Government End Users.  This source code is a "commercial item" as 
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 
+ * "commercial computer software" and "commercial computer software 
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 
+ * and is provided to the U.S. Government only as a commercial end item.  
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
+ * source code with only those rights set forth herein.
+ */
+
+#include "btCudaBroadphaseKernel.h"
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// calculate position in uniform grid
+BT3DGRID__device__ int3 btCuda_calcGridPos(float4 p)
+{
+    int3 gridPos;
+    gridPos.x = (int)floor((p.x - BT3DGRIDparams.m_worldOriginX) / BT3DGRIDparams.m_cellSizeX);
+    gridPos.y = (int)floor((p.y - BT3DGRIDparams.m_worldOriginY) / BT3DGRIDparams.m_cellSizeY);
+    gridPos.z = (int)floor((p.z - BT3DGRIDparams.m_worldOriginZ) / BT3DGRIDparams.m_cellSizeZ);
+    return gridPos;
+}
+
+//----------------------------------------------------------------------------------------
+
+// calculate address in grid from position (clamping to edges)
+BT3DGRID__device__ uint btCuda_calcGridHash(int3 gridPos)
+{
+    gridPos.x = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.x, (int)BT3DGRIDparams.m_gridSizeX - 1));
+    gridPos.y = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.y, (int)BT3DGRIDparams.m_gridSizeY - 1));
+    gridPos.z = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.z, (int)BT3DGRIDparams.m_gridSizeZ - 1));
+    return BT3DGRID__mul24(BT3DGRID__mul24(gridPos.z, BT3DGRIDparams.m_gridSizeY), BT3DGRIDparams.m_gridSizeX) + BT3DGRID__mul24(gridPos.y, BT3DGRIDparams.m_gridSizeX) + gridPos.x;
+}
+
+//----------------------------------------------------------------------------------------
+
+// calculate grid hash value for each body using its AABB
+BT3DGRID__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
+{
+    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	btCuda3F1U bbMin = pAABB[index*2];
+	btCuda3F1U bbMax = pAABB[index*2 + 1];
+	float4 pos;
+	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
+	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
+	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
+    // get address in grid
+    int3 gridPos = btCuda_calcGridPos(pos);
+    uint gridHash = btCuda_calcGridHash(gridPos);
+    // store grid hash and body index
+    pHash[index] = BT3DGRIDmake_uint2(gridHash, index);
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
+{
+    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	// Load hash data into shared memory so that we can look 
+	// at neighboring body's hash value without loading
+	// two hash values per thread
+	BT3DGRID__shared__ uint sharedHash[257];
+	sharedHash[BT3DGRIDthreadIdx.x+1] = sortedData.x;
+	if((index > 0) && (BT3DGRIDthreadIdx.x == 0))
+	{
+		// first thread in block must load neighbor body hash
+		volatile uint2 prevData = pHash[index-1];
+		sharedHash[0] = prevData.x;
+	}
+	BT3DGRID__syncthreads();
+	if((index == 0) || (sortedData.x != sharedHash[BT3DGRIDthreadIdx.x]))
+	{
+		cellStart[sortedData.x] = index;
+	}
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
+{
+	return	(min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && 
+			(min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && 
+			(min0.fz <= max1.fz)&& (min1.fz <= max0.fz); 
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__device__ void findPairsInCell(int3	gridPos,
+								uint    index,
+								uint2*  pHash,
+								uint*   pCellStart,
+								btCuda3F1U* pAABB, 
+								uint*   pPairBuff,
+								uint2*	pPairBuffStartCurr,
+								uint	numBodies)
+{
+    if (	(gridPos.x < 0) || (gridPos.x > (int)BT3DGRIDparams.m_gridSizeX - 1)
+		||	(gridPos.y < 0) || (gridPos.y > (int)BT3DGRIDparams.m_gridSizeY - 1)
+		||  (gridPos.z < 0) || (gridPos.z > (int)BT3DGRIDparams.m_gridSizeZ - 1)) 
+    {
+		return;
+	}
+    uint gridHash = btCuda_calcGridHash(gridPos);
+    // get start of bucket for this cell
+    uint bucketStart = pCellStart[gridHash];
+    if (bucketStart == 0xffffffff)
+	{
+        return;   // cell empty
+	}
+	// iterate over bodies in this cell
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+    btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2); 
+	btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+	uint bucketEnd = bucketStart + BT3DGRIDparams.m_maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
+	for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        uint2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		uint unsorted_indx2 = cellData.y;
+        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
+        {   
+			btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2);
+			btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2 + 1);
+			if(cudaTestAABBOverlap(min0, max0, min1, max1))
+			{
+				uint handleIndex2 = min1.uw;
+				uint k;
+				for(k = 0; k < curr; k++)
+				{
+					uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
+					if(old_pair == handleIndex2)
+					{
+						pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
+						break;
+					}
+				}
+				if(k == curr)
+				{
+					pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
+					if(curr >= curr_max) 
+					{ // not a good solution, but let's avoid crash
+						break;
+					}
+					curr++;
+				}
+			}
+		}
+	}
+	pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr);
+    return;
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__global__ void
+findOverlappingPairsD(	btCuda3F1U*	pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
+						uint2* pPairBuffStartCurr, uint numBodies)
+{
+    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	btCuda3F1U bbMin = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
+	btCuda3F1U bbMax = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
+	float4 pos;
+	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
+	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
+	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
+    // get address in grid
+    int3 gridPos = btCuda_calcGridPos(pos);
+    // examine only neighbouring cells
+    for(int z=-1; z<=1; z++) {
+        for(int y=-1; y<=1; y++) {
+            for(int x=-1; x<=1; x++) {
+                findPairsInCell(gridPos + BT3DGRIDmake_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
+            }
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__global__ void
+findPairsLargeD(	btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
+						uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
+{
+    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
+	btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		uint indx2 = numBodies + i;
+		btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, indx2*2);
+		btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, indx2*2 + 1);
+		if(cudaTestAABBOverlap(min0, max0, min1, max1))
+		{
+			uint k;
+			uint handleIndex2 =  min1.uw;
+			for(k = 0; k < curr; k++)
+			{
+				uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
+					break;
+				}
+			}
+			if(k == curr)
+			{
+				pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				curr++;
+			}
+		}
+    }
+	pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr);
+    return;
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
+{
+    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	btCuda3F1U bbMin = pAABB[index * 2];
+	uint handleIndex = bbMin.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	uint *pInp = pPairBuff + start;
+	uint num_changes = 0;
+	for(uint k = 0; k < curr; k++, pInp++)
+	{
+		if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
+		{
+			num_changes++;
+		}
+	}
+	pPairScan[index+1] = num_changes;
+}
+
+//----------------------------------------------------------------------------------------
+
+BT3DGRID__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
+{
+    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	btCuda3F1U bbMin = pAABB[index * 2];
+	uint handleIndex = bbMin.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	uint* pInp = pPairBuff + start;
+	uint* pOut = pPairOut + pPairScan[index];
+	uint* pOut2 = pInp;
+	uint num = 0; 
+	for(uint k = 0; k < curr; k++, pInp++)
+	{
+		if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
+		{
+			*pOut = *pInp;
+			pOut++;
+		}
+		if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
+		{
+			*pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
+			pOut2++;
+			num++;
+		}
+	}
+	pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, num);
+} // squeezeOverlappingPairBuffD()
+
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               E N D   O F    K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//Round a / b to nearest higher integer value
+int BT3DGRIDPREF(iDivUp)(int a, int b)
+{
+    return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+
+// compute grid and thread block size for a given number of elements
+void BT3DGRIDPREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
+{
+    numThreads = BT3DGRIDmin(blockSize, n);
+    numBlocks = BT3DGRIDPREF(iDivUp)(n, numThreads);
+}
+
+void BT3DGRIDPREF(calcHashAABB)(btCuda3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
+{
+    int numThreads, numBlocks;
+    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    // execute the kernel
+    BT3DGRIDEXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
+    // check if kernel invocation generated an error
+    CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
+}
+
+void BT3DGRIDPREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
+{
+    int numThreads, numBlocks;
+    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+	MY_CUDA_SAFE_CALL(BT3DGPRDMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
+	BT3DGRIDEXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
+    CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
+}
+
+void BT3DGRIDPREF(findOverlappingPairs(btCuda3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies))
+{
+#if B_CUDA_USE_TEX
+    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
+#endif
+    int numThreads, numBlocks;
+    BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
+    BT3DGRIDEXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
+    CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
+#if B_CUDA_USE_TEX
+    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+ }
+
+
+
+void BT3DGRIDPREF(findPairsLarge(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
+{
+#if B_CUDA_USE_TEX
+    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
+#endif
+    int numThreads, numBlocks;
+    BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
+    BT3DGRIDEXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
+    CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
+#if B_CUDA_USE_TEX
+    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+ } 
+
+
+void BT3DGRIDPREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies))
+{
+    int numThreads, numBlocks;
+    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    BT3DGRIDEXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
+    CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
+ }
+
+
+void BT3DGRIDPREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies))
+{
+    int numThreads, numBlocks;
+    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    BT3DGRIDEXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
+    CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
+} // btCuda_squeezeOverlappingPairBuff()
+
+
+}   // extern "C"
diff --git a/Extras/CUDA/btCudaBroadphase.cpp b/Extras/CUDA/btCudaBroadphase.cpp
index ab873439a..44e22fa87 100644
--- a/Extras/CUDA/btCudaBroadphase.cpp
+++ b/Extras/CUDA/btCudaBroadphase.cpp
@@ -34,39 +34,8 @@ btCudaBroadphase::btCudaBroadphase(const btVector3& worldAabbMin,const btVector3
 								   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
 								   int maxBodiesPerCell,
 								   btScalar cellFactorAABB) :
-	btSimpleBroadphase(maxSmallProxies,
-//				     new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache),
-				     new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache),
-	m_bInitialized(false),
-    m_numBodies(0)
+	bt3DGridBroadphase(worldAabbMin, worldAabbMax, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerBody, maxBodiesPerCell,cellFactorAABB)
 {
-	m_ownsPairCache = true;
-	m_params.m_gridSizeX = gridSizeX;
-	m_params.m_gridSizeY = gridSizeY;
-	m_params.m_gridSizeZ = gridSizeZ;
-	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
-	btVector3 w_org = worldAabbMin;
-	m_params.m_worldOriginX = w_org.getX();
-	m_params.m_worldOriginY = w_org.getY();
-	m_params.m_worldOriginZ = w_org.getZ();
-	btVector3 w_size = worldAabbMax - worldAabbMin;
-	m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
-	m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
-	m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
-	m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
-	m_maxRadius *= btScalar(0.5f);
-	m_params.m_numBodies = m_numBodies;
-	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
-
-	m_numLargeHandles = 0;						
-	m_maxLargeHandles = maxLargeProxies;
-
-	m_maxPairsPerBody = maxPairsPerBody;
-
-	m_cellFactorAABB = cellFactorAABB;
-
-	m_LastLargeHandleIndex = -1;
-
 	_initialize();
 } // btCudaBroadphase::btCudaBroadphase()
 
@@ -83,34 +52,6 @@ btCudaBroadphase::~btCudaBroadphase()
 
 void btCudaBroadphase::_initialize()
 {
-    assert(!m_bInitialized);
-    // allocate host storage
-    m_hBodiesHash = new unsigned int[m_maxHandles * 2];
-    memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
-
-    m_hCellStart = new unsigned int[m_params.m_numCells];
-    memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
-
-	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
-	// --------------- for now, init with m_maxPairsPerBody for each body
-	m_hPairBuffStartCurr[0] = 0;
-	m_hPairBuffStartCurr[1] = 0;
-	for(int i = 1; i <= m_maxHandles; i++) 
-	{
-		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
-		m_hPairBuffStartCurr[i * 2 + 1] = 0;
-	}
-	//----------------
-	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
-	m_hAABB = new btCuda3F1U[numAABB * 2]; // AABB Min & Max
-
-	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
-	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
-
-	m_hPairScan = new unsigned int[m_maxHandles + 1];
-
-	m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];
-
     // allocate GPU data
     btCuda_allocateArray((void**)&m_dBodiesHash[0], m_maxHandles * 2 * sizeof(unsigned int));
     btCuda_allocateArray((void**)&m_dBodiesHash[1], m_maxHandles * 2 * sizeof(unsigned int));
@@ -123,34 +64,12 @@ void btCudaBroadphase::_initialize()
     btCuda_allocateArray((void**)&m_dPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
 	btCuda_copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); 
 
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
 	btCuda_allocateArray((void**)&m_dAABB, numAABB * sizeof(btCuda3F1U) * 2);
 
     btCuda_allocateArray((void**)&m_dPairScan, (m_maxHandles + 1) * sizeof(unsigned int));
 
 	btCuda_allocateArray((void**)&m_dPairOut, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
-
-    btCuda_setParameters(&m_params);
-
-// large proxies
-
-	// allocate handles buffer and put all handles on free list
-	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
-	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
-	m_firstFreeLargeHandle = 0;
-	{
-		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
-		{
-			m_pLargeHandles[i].SetNextFree(i + 1);
-			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
-		}
-		m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
-	}
-
-// debug data
-	m_numPairsAdded = 0;
-	m_numOverflows = 0;
-
-    m_bInitialized = true;
 } // btCudaBroadphase::_initialize()
 
 //--------------------------------------------------------------------------
@@ -179,324 +98,104 @@ void btCudaBroadphase::_finalize()
 	m_bInitialized = false;
 } // btCudaBroadphase::_finalize()
 
+//--------------------------------------------------------------------------
+//--------------------------------------------------------------------------
+//
+// overrides for CUDA version
+//
+//--------------------------------------------------------------------------
 //--------------------------------------------------------------------------
 
-void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
+void btCudaBroadphase::prepareAABB()
 {
-	if(m_numHandles <= 0)
-	{
-		BT_PROFILE("addLarge2LargePairsToCache -- CPU");
-		addLarge2LargePairsToCache(dispatcher);
-		return;
-	}
+	bt3DGridBroadphase::prepareAABB();
+	btCuda_copyArrayToDevice(m_dAABB, m_hAABB, sizeof(btCuda3F1U) * 2 * (m_numHandles + m_numLargeHandles)); 
+	return;
+} // btCudaBroadphase::prepareAABB()
 
-		
+//--------------------------------------------------------------------------
 
-	// update constants
-	btCuda_setParameters(&m_params);
+void btCudaBroadphase::setParameters(btCudaBroadphaseParams* hostParams)
+{
+	btCuda_setParameters(hostParams);
+	return;
+} // btCudaBroadphase::setParameters()
 
-	
+//--------------------------------------------------------------------------
 
-	// move AABB array to GPU
-	{
-		BT_PROFILE("copy AABB");
-		// do it faster ? 
-		btCuda3F1U* pBB = m_hAABB;
-		int i;
-		int new_largest_index = -1;
-		unsigned int num_small = 0;
-
-		for(i = 0; i <= m_LastHandleIndex; i++) 
-		{
-			btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i];
-			if(!proxy0->m_clientObject)
-			{
-				continue;
-			}
-			new_largest_index = i;
-			pBB->fx = proxy0->m_aabbMin.getX();
-			pBB->fy = proxy0->m_aabbMin.getY();
-			pBB->fz = proxy0->m_aabbMin.getZ();
-			pBB->uw = i;
-			pBB++;
-			pBB->fx = proxy0->m_aabbMax.getX();
-			pBB->fy = proxy0->m_aabbMax.getY();
-			pBB->fz = proxy0->m_aabbMax.getZ();
-			pBB->uw = num_small;
-			pBB++;
-			num_small++;
-		}
-		m_LastHandleIndex = new_largest_index;
-		new_largest_index = -1;
-		unsigned int num_large = 0;
-		for(i = 0; i <= m_LastLargeHandleIndex; i++) 
-		{
-			btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
-			if(!proxy0->m_clientObject)
-			{
-				continue;
-			}
-			new_largest_index = i;
-			pBB->fx = proxy0->m_aabbMin.getX();
-			pBB->fy = proxy0->m_aabbMin.getY();
-			pBB->fz = proxy0->m_aabbMin.getZ();
-			pBB->uw = i + m_maxHandles;
-			pBB++;
-			pBB->fx = proxy0->m_aabbMax.getX();
-			pBB->fy = proxy0->m_aabbMax.getY();
-			pBB->fz = proxy0->m_aabbMax.getZ();
-			pBB->uw = num_large + m_maxHandles;
-			pBB++;
-			num_large++;
-		}
-		m_LastLargeHandleIndex = new_largest_index;
-		// paranoid checks
-		btAssert(num_small == m_numHandles);
-		btAssert(num_large == m_numLargeHandles);
-	}
-	
-	{
-		BT_PROFILE("CopyBB to CUDA");
-		btCuda_copyArrayToDevice(m_dAABB, m_hAABB, sizeof(btCuda3F1U) * 2 * (m_numHandles + m_numLargeHandles)); 
-	}
-	// calculate hash
-	{
-		BT_PROFILE("calcHash -- CUDA");
-		btCuda_calcHashAABB(m_dAABB, m_dBodiesHash[0], m_numHandles);
-	}
+void btCudaBroadphase::calcHashAABB()
+{
+	BT_PROFILE("btCuda_calcHashAABB");
+	btCuda_calcHashAABB(m_dAABB, m_dBodiesHash[0], m_numHandles);
 //	btCuda_copyArrayFromDevice((void*)m_hBodiesHash, (void*)m_dBodiesHash[0], sizeof(unsigned int) * 2 * m_numHandles);
-	// sort bodies based on hash
-	{
-		BT_PROFILE("RadixSort-- CUDA");
-		RadixSort((KeyValuePair*)m_dBodiesHash[0], (KeyValuePair*)m_dBodiesHash[1], m_numHandles, 32);
-	}
-	// find start of each cell
-	{
-		BT_PROFILE("Find cell start -- CUDA");
-		btCuda_findCellStart(m_dBodiesHash[0],	m_dCellStart, m_numHandles, m_params.m_numCells);
-	}
+	return;
+} // btCudaBroadphase::calcHashAABB()
+
+//--------------------------------------------------------------------------
+
+void btCudaBroadphase::sortHash()
+{
+	BT_PROFILE("RadixSort-- CUDA");
+	RadixSort((KeyValuePair*)m_dBodiesHash[0], (KeyValuePair*)m_dBodiesHash[1], m_numHandles, 32);
+	return;
+} // btCudaBroadphase::sortHash()
+
+//--------------------------------------------------------------------------
+
+void btCudaBroadphase::findCellStart()
+{
+	BT_PROFILE("btCuda_findCellStart");
+	btCuda_findCellStart(m_dBodiesHash[0],	m_dCellStart, m_numHandles, m_params.m_numCells);
 //	btCuda_copyArrayFromDevice((void*)m_hBodiesHash, (void*)m_dBodiesHash[0], sizeof(unsigned int) * 2 * m_numHandles);
 //	btCuda_copyArrayFromDevice((void*)m_hCellStart, (void*)m_dCellStart, sizeof(unsigned int) * m_params.m_numCells);
-	{
-		BT_PROFILE("FindOverlappingPairs -- CUDA");
-		btCuda_findOverlappingPairs(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr,	m_numHandles);
-	}
-	{
-		BT_PROFILE("FindPairsLarge -- CUDA");
-		btCuda_findPairsLarge(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr,	m_numHandles, m_numLargeHandles);
-	}
-	{
-		BT_PROFILE("ComputePairCacheChanges -- CUDA");
-		btCuda_computePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dAABB, m_numHandles);
-	}
-	{
-		BT_PROFILE("scanOverlappingPairBuff -- CPU");
-		btCuda_copyArrayFromDevice(m_hPairScan, m_dPairScan, sizeof(unsigned int)*(m_numHandles + 1)); 
-		scanOverlappingPairBuffCPU();
-		btCuda_copyArrayToDevice(m_dPairScan, m_hPairScan, sizeof(unsigned int)*(m_numHandles + 1)); 
-	}
-	{
-		BT_PROFILE("SqueezeOverlappingPairBuff -- CUDA");
-		btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_dAABB, m_numHandles);
-	}
-	{
-		BT_PROFILE("SqueezeOverlappingPairBuff -- CUDA");
-		btCuda_copyArrayFromDevice(m_hPairOut, m_dPairOut, sizeof(unsigned int) * m_hPairScan[m_numHandles]); 
-	}
-	{
-		BT_PROFILE("addPairsToCache -- CPU");
-		addPairsToCacheCPU(dispatcher);
-	}
-	{
-		BT_PROFILE("addLarge2LargePairsToCache -- CPU");
-		addLarge2LargePairsToCache(dispatcher);
-	}
 	return;
-} // btCudaBroadphase::calculateOverlappingPairs()
+} // btCudaBroadphase::findCellStart()
 
 //--------------------------------------------------------------------------
 
-void btCudaBroadphase::scanOverlappingPairBuffCPU()
+void btCudaBroadphase::findOverlappingPairs()
 {
-	m_hPairScan[0] = 0;
-	for(int i = 1; i <= m_numHandles; i++) 
-	{
-		unsigned int delta = m_hPairScan[i];
-		m_hPairScan[i] = m_hPairScan[i-1] + delta;
-	}
-} // btCudaBroadphase::scanOverlappingPairBuffCPU()
-
-//--------------------------------------------------------------------------
-
-void btCudaBroadphase::addPairsToCacheCPU(btDispatcher* dispatcher)
-{
-	m_numPairsAdded = 0;
-	m_numPairsRemoved = 0;
-	for(int i = 0; i < m_numHandles; i++) 
-	{
-		unsigned int num = m_hPairScan[i+1] - m_hPairScan[i];
-		if(!num)
-		{
-			continue;
-		}
-		unsigned int* pInp = m_hPairOut + m_hPairScan[i];
-		unsigned int index0 = m_hAABB[i * 2].uw;
-		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0];
-		for(unsigned int j = 0; j < num; j++)
-		{
-			unsigned int indx1_s = pInp[j];
-			unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG);
-			btSimpleBroadphaseProxy* proxy1;
-			if(index1 < (unsigned int)m_maxHandles)
-			{
-				proxy1 = &m_pHandles[index1];
-			}
-			else
-			{
-				index1 -= m_maxHandles;
-				btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles));
-				proxy1 = &m_pLargeHandles[index1];
-			}
-			if(indx1_s & BT_CUDA_PAIR_NEW_FLG)
-			{
-				m_pairCache->addOverlappingPair(proxy0,proxy1);
-				m_numPairsAdded++;
-			}
-			else
-			{
-				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
-				m_numPairsRemoved++;
-			}
-		}
-	}
-} // btCudaBroadphase::addPairsToCacheCPU()
-
-//--------------------------------------------------------------------------
-
-btBroadphaseProxy* btCudaBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
-{
-	btBroadphaseProxy*  proxy;
-	bool bIsLarge = isLargeProxy(aabbMin, aabbMax);
-	if(bIsLarge)
-	{
-		if (m_numLargeHandles >= m_maxLargeHandles)
-		{
-			btAssert(0);
-			return 0; //should never happen, but don't let the game crash ;-)
-		}
-		btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
-		int newHandleIndex = allocLargeHandle();
-		proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
-	}
-	else
-	{
-		proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
-	}
-	return proxy;
-} // btCudaBroadphase::createProxy()
-
-//--------------------------------------------------------------------------
-
-void btCudaBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
-{
-	bool bIsLarge = isLargeProxy(proxy);
-	if(bIsLarge)
-	{
-		
-		btSimpleBroadphaseProxy* proxy0 = static_cast<btSimpleBroadphaseProxy*>(proxy);
-		freeLargeHandle(proxy0);
-		// TODO : remove pair from cache on GPU as well !!!
-		// UPD: they will not be used anyway, so don't waste time
-		m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
-	}
-	else
-	{
-		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
-	}
+	BT_PROFILE("btCuda_findOverlappingPairs");
+	btCuda_findOverlappingPairs(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr,	m_numHandles);
 	return;
-} // btCudaBroadphase::destroyProxy()
+} // btCudaBroadphase::findOverlappingPairs()
 
 //--------------------------------------------------------------------------
 
-bool btCudaBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
+void btCudaBroadphase::findPairsLarge()
 {
-	btVector3 diag = aabbMax - aabbMin;
-	btScalar radius = diag.length() * btScalar(0.5f);
-
-	radius *= m_cellFactorAABB; // user-defined factor
-
-	return (radius > m_maxRadius);
-} // btCudaBroadphase::isLargeProxy()
-
-//--------------------------------------------------------------------------
-
-bool btCudaBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
-{
-	return (proxy->getUid() >= (m_maxHandles+2));
-} // btCudaBroadphase::isLargeProxy()
-
-//--------------------------------------------------------------------------
-
-void btCudaBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
-{
-	int i,j;
-	if (m_numLargeHandles <= 0)
-	{
-		return;
-	}
-	int new_largest_index = -1;
-	for(i = 0; i <= m_LastLargeHandleIndex; i++)
-	{
-		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
-		if(!proxy0->m_clientObject)
-		{
-			continue;
-		}
-		new_largest_index = i;
-		for(j = i + 1; j <= m_LastLargeHandleIndex; j++)
-		{
-			btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j];
-			if(!proxy1->m_clientObject)
-			{
-				continue;
-			}
-			btAssert(proxy0 != proxy1);
-			btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
-			btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
-			if(aabbOverlap(p0,p1))
-			{
-				if (!m_pairCache->findPair(proxy0,proxy1))
-				{
-					m_pairCache->addOverlappingPair(proxy0,proxy1);
-				}
-			} 
-			else
-			{
-				if(m_pairCache->findPair(proxy0,proxy1))
-				{
-					m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
-				}
-			}
-		}
-	}
-	m_LastLargeHandleIndex = new_largest_index;
+	BT_PROFILE("btCuda_findPairsLarge");
+	btCuda_findPairsLarge(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr,	m_numHandles, m_numLargeHandles);
 	return;
-} // btCudaBroadphase::addLarge2LargePairsToCache()
+} // btCudaBroadphase::findPairsLarge()
 
 //--------------------------------------------------------------------------
 
-void btCudaBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
+void btCudaBroadphase::computePairCacheChanges()
 {
-	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
-	for (int i=0; i <= m_LastLargeHandleIndex; i++)
-	{
-		btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i];
-		if(!proxy->m_clientObject)
-		{
-			continue;
-		}
-		rayCallback.process(proxy);
-	}
-} // btCudaBroadphase::rayTest()
+	BT_PROFILE("btCuda_computePairCacheChanges");
+	btCuda_computePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dAABB, m_numHandles);
+	return;
+} // btCudaBroadphase::computePairCacheChanges()
+
+//--------------------------------------------------------------------------
+
+void btCudaBroadphase::scanOverlappingPairBuff()
+{
+	btCuda_copyArrayFromDevice(m_hPairScan, m_dPairScan, sizeof(unsigned int)*(m_numHandles + 1)); 
+	bt3DGridBroadphase::scanOverlappingPairBuff();
+	btCuda_copyArrayToDevice(m_dPairScan, m_hPairScan, sizeof(unsigned int)*(m_numHandles + 1)); 
+	return;
+} // btCudaBroadphase::scanOverlappingPairBuff()
+
+//--------------------------------------------------------------------------
+
+void btCudaBroadphase::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
+	btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_dAABB, m_numHandles);
+	btCuda_copyArrayFromDevice(m_hPairOut, m_dPairOut, sizeof(unsigned int) * m_hPairScan[m_numHandles]); 
+	return;
+} // btCudaBroadphase::squeezeOverlappingPairBuff()
 
 //--------------------------------------------------------------------------
diff --git a/Extras/CUDA/btCudaBroadphase.cu b/Extras/CUDA/btCudaBroadphase.cu
index e21807af8..225a5495c 100644
--- a/Extras/CUDA/btCudaBroadphase.cu
+++ b/Extras/CUDA/btCudaBroadphase.cu
@@ -65,9 +65,9 @@
 __device__ inline btCuda3F1U tex_fetch3F1U(float4 a) { return *((btCuda3F1U*)(&a)); }
 
 #if B_CUDA_USE_TEX
-	#define FETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
+	#define BT3DGRIDFETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
 #else
-	#define FETCH(t, i) t[i]
+	#define BT3DGRIDFETCH(t, i) t[i]
 #endif
 
 texture<uint2, 1, cudaReadModeElementType> particleHashTex;
@@ -80,323 +80,25 @@ __constant__ btCudaBroadphaseParams params;
 
 //----------------------------------------------------------------------------------------
 
-// calculate position in uniform grid
-__device__ int3 btCuda_calcGridPos(float4 p)
-{
-    int3 gridPos;
-    gridPos.x = floor((p.x - params.m_worldOriginX) / params.m_cellSizeX);
-    gridPos.y = floor((p.y - params.m_worldOriginY) / params.m_cellSizeY);
-    gridPos.z = floor((p.z - params.m_worldOriginZ) / params.m_cellSizeZ);
-    return gridPos;
-}
+#define BT3DGRID__device__ __device__
+#define BT3DGRIDmax(a, b) max(a, b)
+#define BT3DGRIDmin(a, b) min(a, b)
+#define BT3DGRIDparams params
+#define BT3DGRID__mul24(a, b) __mul24(a, b)
+#define BT3DGRID__global__ __global__
+#define BT3DGRID__shared__ __shared__
+#define BT3DGRID__syncthreads() __syncthreads()
+#define BT3DGRIDmake_uint2(x, y) make_uint2(x, y)
+#define BT3DGRIDmake_int3(x, y, z) make_int3(x, y, z)
+#define BT3DGRIDPREF(func) btCuda_##func
+#define BT3DGPRDMemset cudaMemset
+#define BT3DGRIDblockIdx blockIdx
+#define BT3DGRIDblockDim blockDim
+#define BT3DGRIDthreadIdx threadIdx
+#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) kfunc<<<numb, numt>>>args
 
 //----------------------------------------------------------------------------------------
 
-// calculate address in grid from position (clamping to edges)
-__device__ uint btCuda_calcGridHash(int3 gridPos)
-{
-    gridPos.x = max(0, min(gridPos.x, params.m_gridSizeX - 1));
-    gridPos.y = max(0, min(gridPos.y, params.m_gridSizeY - 1));
-    gridPos.z = max(0, min(gridPos.z, params.m_gridSizeZ - 1));
-    return __mul24(__mul24(gridPos.z, params.m_gridSizeY), params.m_gridSizeX) + __mul24(gridPos.y, params.m_gridSizeX) + gridPos.x;
-}
-
-//----------------------------------------------------------------------------------------
-
-// calculate grid hash value for each body using its AABB
-__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
-{
-    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-	btCuda3F1U bbMin = pAABB[index*2];
-	btCuda3F1U bbMax = pAABB[index*2 + 1];
-	float4 pos;
-	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
-	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
-	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
-    // get address in grid
-    int3 gridPos = btCuda_calcGridPos(pos);
-    uint gridHash = btCuda_calcGridHash(gridPos);
-    // store grid hash and body index
-    pHash[index] = make_uint2(gridHash, index);
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	// Load hash data into shared memory so that we can look 
-	// at neighboring body's hash value without loading
-	// two hash values per thread
-	__shared__ uint sharedHash[257];
-	sharedHash[threadIdx.x+1] = sortedData.x;
-	if((index > 0) && (threadIdx.x == 0))
-	{
-		// first thread in block must load neighbor body hash
-		volatile uint2 prevData = pHash[index-1];
-		sharedHash[0] = prevData.x;
-	}
-	__syncthreads();
-	if((index == 0) || (sortedData.x != sharedHash[threadIdx.x]))
-	{
-		cellStart[sortedData.x] = index;
-	}
-}
-
-//----------------------------------------------------------------------------------------
-
-__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
-{
-	return	(min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && 
-			(min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && 
-			(min0.fz <= max1.fz)&& (min1.fz <= max0.fz); 
-}
-
-//----------------------------------------------------------------------------------------
-
-__device__ void findPairsInCell(int3	gridPos,
-								uint    index,
-								uint2*  pHash,
-								uint*   pCellStart,
-								btCuda3F1U* pAABB, 
-								uint*   pPairBuff,
-								uint2*	pPairBuffStartCurr,
-								uint	numBodies)
-{
-    if (	(gridPos.x < 0) || (gridPos.x > params.m_gridSizeX - 1)
-		||	(gridPos.y < 0) || (gridPos.y > params.m_gridSizeY - 1)
-		||  (gridPos.z < 0) || (gridPos.z > params.m_gridSizeZ - 1)) 
-    {
-		return;
-	}
-    uint gridHash = btCuda_calcGridHash(gridPos);
-    // get start of bucket for this cell
-    uint bucketStart = pCellStart[gridHash];
-    if (bucketStart == 0xffffffff)
-	{
-        return;   // cell empty
-	}
-	// iterate over bodies in this cell
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-    btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2); 
-	btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
-	uint handleIndex =  min0.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
-	uint curr_max = start_curr_next.x - start - 1;
-	uint bucketEnd = bucketStart + params.m_maxBodiesPerCell;
-	bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
-	for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
-	{
-        uint2 cellData = pHash[index2];
-        if (cellData.x != gridHash)
-        {
-			break;   // no longer in same bucket
-		}
-		uint unsorted_indx2 = cellData.y;
-        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
-        {   
-			btCuda3F1U min1 = FETCH(pAABB, unsorted_indx2*2);
-			btCuda3F1U max1 = FETCH(pAABB, unsorted_indx2*2 + 1);
-			if(cudaTestAABBOverlap(min0, max0, min1, max1))
-			{
-				uint handleIndex2 = min1.uw;
-				uint k;
-				for(k = 0; k < curr; k++)
-				{
-					uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
-					if(old_pair == handleIndex2)
-					{
-						pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
-						break;
-					}
-				}
-				if(k == curr)
-				{
-					pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
-					if(curr >= curr_max) 
-					{ // not a good solution, but let's avoid crash
-						break;
-					}
-					curr++;
-				}
-			}
-		}
-	}
-	pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
-    return;
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void
-findOverlappingPairsD(	btCuda3F1U*	pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
-						uint2* pPairBuffStartCurr, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-	btCuda3F1U bbMin = FETCH(pAABB, unsorted_indx*2);
-	btCuda3F1U bbMax = FETCH(pAABB, unsorted_indx*2 + 1);
-	float4 pos;
-	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
-	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
-	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
-    // get address in grid
-    int3 gridPos = btCuda_calcGridPos(pos);
-    // examine only neighbouring cells
-    for(int z=-1; z<=1; z++) {
-        for(int y=-1; y<=1; y++) {
-            for(int x=-1; x<=1; x++) {
-                findPairsInCell(gridPos + make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
-            }
-        }
-    }
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void
-findPairsLargeD(	btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
-						uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-	btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
-	btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
-	uint handleIndex =  min0.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
-	uint curr_max = start_curr_next.x - start - 1;
-    for(uint i = 0; i < numLarge; i++)
-    {
-		uint indx2 = numBodies + i;
-		btCuda3F1U min1 = FETCH(pAABB, indx2*2);
-		btCuda3F1U max1 = FETCH(pAABB, indx2*2 + 1);
-		if(cudaTestAABBOverlap(min0, max0, min1, max1))
-		{
-			uint k;
-			uint handleIndex2 =  min1.uw;
-			for(k = 0; k < curr; k++)
-			{
-				uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
-				if(old_pair == handleIndex2)
-				{
-					pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
-					break;
-				}
-			}
-			if(k == curr)
-			{
-				pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
-				if(curr >= curr_max) 
-				{ // not a good solution, but let's avoid crash
-					break;
-				}
-				curr++;
-			}
-		}
-    }
-	pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
-    return;
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-	btCuda3F1U bbMin = pAABB[index * 2];
-	uint handleIndex = bbMin.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint *pInp = pPairBuff + start;
-	uint num_changes = 0;
-	for(uint k = 0; k < curr; k++, pInp++)
-	{
-		if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
-		{
-			num_changes++;
-		}
-	}
-	pPairScan[index+1] = num_changes;
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-	btCuda3F1U bbMin = pAABB[index * 2];
-	uint handleIndex = bbMin.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint* pInp = pPairBuff + start;
-	uint* pOut = pPairOut + pPairScan[index];
-	uint* pOut2 = pInp;
-	uint num = 0; 
-	for(uint k = 0; k < curr; k++, pInp++)
-	{
-		if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
-		{
-			*pOut = *pInp;
-			pOut++;
-		}
-		if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
-		{
-			*pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
-			pOut2++;
-			num++;
-		}
-	}
-	pPairBuffStartCurr[handleIndex] = make_uint2(start, num);
-} // squeezeOverlappingPairBuffD()
-
-
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//               E N D   O F    K E R N E L    F U N C T I O N S 
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-
-
 //! Check for CUDA error
 #  define CUT_CHECK_ERROR(errorMessage) do {                                 \
     cudaError_t err = cudaGetLastError();                                    \
@@ -430,9 +132,7 @@ __global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffSta
         btCuda_exit(EXIT_FAILURE);                                           \
     } } while (0)
 
-
-extern "C"
-{
+//----------------------------------------------------------------------------------------
 
 void btCuda_exit(int val)
 {
@@ -465,125 +165,9 @@ void btCuda_setParameters(btCudaBroadphaseParams* hostParams)
     MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(btCudaBroadphaseParams)));
 }
 
-//Round a / b to nearest higher integer value
-int btCuda_iDivUp(int a, int b)
-{
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-}
+//----------------------------------------------------------------------------------------
 
-// compute grid and thread block size for a given number of elements
-void btCuda_computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
-{
-    numThreads = min(blockSize, n);
-    numBlocks = btCuda_iDivUp(n, numThreads);
-}
+#include "bt3DGridBroadphaseFunc.h"
 
-void btCuda_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-    // execute the kernel
-    calcHashAABBD<<< numBlocks, numThreads >>>(pAABB, (uint2*)hash, numBodies);
-    // check if kernel invocation generated an error
-    CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
-}
+//----------------------------------------------------------------------------------------
 
-void btCuda_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-	MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
-    findCellStartD<<< numBlocks, numThreads >>>((uint2*)hash, (uint*)cellStart, numBodies);
-    CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
-}
-
-void btCuda_findOverlappingPairs(	btCuda3F1U*		pAABB, unsigned int* pHash,
-									unsigned int*	pCellStart,
-									unsigned int*	pPairBuff,
-									unsigned int*	pPairBuffStartCurr,
-									unsigned int	numBodies)
-{
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
-#endif
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
-    findOverlappingPairsD<<< numBlocks, numThreads >>>(
-		pAABB,
-		(uint2*)pHash,
-        (uint*)pCellStart,
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-		numBodies
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
-#endif
- } // btCuda_findOverlappingPairs()
-
-
-
-void btCuda_findPairsLarge(	btCuda3F1U*	pAABB, unsigned int* pHash,
-							unsigned int*	pCellStart,
-							unsigned int*	pPairBuff,
-							unsigned int*	pPairBuffStartCurr,
-							unsigned int	numBodies,
-							unsigned int	numLarge)
-{
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
-#endif
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
-    findPairsLargeD<<< numBlocks, numThreads >>>(
-		pAABB,
-		(uint2*)pHash,
-        (uint*)pCellStart,
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-		numBodies,
-		numLarge
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
-#endif
- } // btCuda_findPairsLarge()
-
-
-
-void btCuda_computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, 
-									unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-    computePairCacheChangesD<<< numBlocks, numThreads >>>(
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-        (uint*)pPairScan,
-        pAABB, 
-        numBodies
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
- } // btCuda_computePairCacheChanges()
-
-
-void btCuda_squeezeOverlappingPairBuff(	unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, 
-										unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-    squeezeOverlappingPairBuffD<<< numBlocks, numThreads >>>(
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-        (uint*)pPairScan,
-        (uint*)pPairOut,
-        pAABB, 
-        numBodies
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
-} // btCuda_squeezeOverlappingPairBuff()
-
-
-}   // extern "C"
diff --git a/Extras/CUDA/btCudaBroadphase.h b/Extras/CUDA/btCudaBroadphase.h
index 1cdc3b790..89ca382b8 100644
--- a/Extras/CUDA/btCudaBroadphase.h
+++ b/Extras/CUDA/btCudaBroadphase.h
@@ -21,14 +21,18 @@ subject to the following restrictions:
 #include "btCudaBroadphaseKernel.h"
 
 
-///The btCudaBroadphase uses CUDA to compute overlapping pairs using a GPU.
-class btCudaBroadphase : public btSimpleBroadphase
+///The bt3DGridBroadphase uses CUDA to compute overlapping pairs using a GPU.
+class bt3DGridBroadphase : public btSimpleBroadphase
 {
+protected:
 	bool			m_bInitialized;
     unsigned int	m_numBodies;
     unsigned int	m_numCells;
 	unsigned int	m_maxPairsPerBody;
 	btScalar		m_cellFactorAABB;
+    unsigned int	m_maxBodiesPerCell;
+	btCudaBroadphaseParams m_params;
+	btScalar		m_maxRadius;
 	// CPU data
     unsigned int*	m_hBodiesHash;
     unsigned int*	m_hCellStart;
@@ -37,17 +41,6 @@ class btCudaBroadphase : public btSimpleBroadphase
 	unsigned int*	m_hPairBuff;
 	unsigned int*	m_hPairScan;
 	unsigned int*	m_hPairOut;
-    // GPU data
-    unsigned int*	m_dBodiesHash[2];
-    unsigned int*	m_dCellStart;
-	unsigned int*	m_dPairBuff; 
-	unsigned int*	m_dPairBuffStartCurr;
-	btCuda3F1U*		m_dAABB;
-	unsigned int*	m_dPairScan;
-	unsigned int*	m_dPairOut;
-    unsigned int	m_maxBodiesPerCell;
-	btCudaBroadphaseParams m_params;
-	btScalar		m_maxRadius;
 // large proxies
 	int		m_numLargeHandles;						
 	int		m_maxLargeHandles;						
@@ -82,12 +75,56 @@ class btCudaBroadphase : public btSimpleBroadphase
 	}
 	bool isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax);
 	bool isLargeProxy(btBroadphaseProxy* proxy);
-
 // debug
 	unsigned int	m_numPairsAdded;
 	unsigned int	m_numPairsRemoved;
 	unsigned int	m_numOverflows;
 // 
+public:
+	bt3DGridBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+					   int gridSizeX, int gridSizeY, int gridSizeZ, 
+					   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+					   int maxBodiesPerCell = 8,
+					   btScalar cellFactorAABB = btScalar(1.0f));
+	virtual ~bt3DGridBroadphase();
+	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
+
+	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
+	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
+	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
+protected:
+	void _initialize();
+	void _finalize();
+	void addPairsToCache(btDispatcher* dispatcher);
+	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
+
+// overrides for CPU version
+	virtual void setParameters(btCudaBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff();
+	virtual void squeezeOverlappingPairBuff();
+};
+
+
+
+///The btCudaBroadphase uses CUDA to compute overlapping pairs using a GPU.
+class btCudaBroadphase : public bt3DGridBroadphase
+{
+protected:
+    // GPU data
+    unsigned int*	m_dBodiesHash[2];
+    unsigned int*	m_dCellStart;
+	unsigned int*	m_dPairBuff; 
+	unsigned int*	m_dPairBuffStartCurr;
+	btCuda3F1U*		m_dAABB;
+	unsigned int*	m_dPairScan;
+	unsigned int*	m_dPairOut;
 public:
 	btCudaBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
 								   int gridSizeX, int gridSizeY, int gridSizeZ, 
@@ -95,18 +132,21 @@ public:
 								   int maxBodiesPerCell = 8,
 								   btScalar cellFactorAABB = btScalar(1.0f));
 	virtual ~btCudaBroadphase();
-	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
-
-	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
-	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
-	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
-
-
 protected:
 	void _initialize();
 	void _finalize();
-	void scanOverlappingPairBuffCPU();
-	void addPairsToCacheCPU(btDispatcher* dispatcher);
-	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
+	void allocateArray(void** devPtr, unsigned int size);
+	void freeArray(void* devPtr);
+// overrides for CUDA version
+	virtual void setParameters(btCudaBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff();
+	virtual void squeezeOverlappingPairBuff();
 };
 #endif //CUDA_BROADPHASE_H
\ No newline at end of file
diff --git a/Extras/CUDA/btCudaBroadphaseKernel.h b/Extras/CUDA/btCudaBroadphaseKernel.h
index 5548c076b..a54370097 100644
--- a/Extras/CUDA/btCudaBroadphaseKernel.h
+++ b/Extras/CUDA/btCudaBroadphaseKernel.h
@@ -54,7 +54,28 @@ struct btCuda3F1U
 
 extern "C"
 {
+// CPU functions
+	void bt3DGrid_setParameters(btCudaBroadphaseParams* hostParams);
+	void bt3DGrid_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash,	unsigned int numBodies);
+	void bt3DGrid_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
+	void bt3DGrid_findOverlappingPairs(	btCuda3F1U*	pAABB, unsigned int* pHash,
+										unsigned int*	pCellStart,
+										unsigned int*	pPairBuff,
+										unsigned int*	pPairBuffStartCurr,
+										unsigned int	numBodies);
+	void bt3DGrid_findPairsLarge(	btCuda3F1U*	pAABB, unsigned int* pHash,
+									unsigned int*	pCellStart,
+									unsigned int*	pPairBuff,
+									unsigned int*	pPairBuffStartCurr,
+									unsigned int	numBodies,
+									unsigned int	numLarge);
 
+	void bt3DGrid_computePairCacheChanges(	unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, 
+											unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies);
+	void bt3DGrid_squeezeOverlappingPairBuff(	unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, 
+												unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies);
+
+// CUDA functions
 	void btCuda_allocateArray(void** devPtr, unsigned int size);
 	void btCuda_freeArray(void* devPtr);
 	void btCuda_copyArrayFromDevice(void* host, const void* device, unsigned int size);
diff --git a/Extras/CUDA/libbulletcuda.vcproj b/Extras/CUDA/libbulletcuda.vcproj
index 53a6d1f6a..3466f0f3c 100644
--- a/Extras/CUDA/libbulletcuda.vcproj
+++ b/Extras/CUDA/libbulletcuda.vcproj
@@ -506,6 +506,14 @@
 		<Filter
 			Name="Source Files"
 			>
+			<File
+				RelativePath=".\bt3DGridBroadphase.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\bt3DGridBroadphaseFunc.h"
+				>
+			</File>
 			<File
 				RelativePath=".\btCudaBroadphase.cpp"
 				>
diff --git a/Extras/CUDA/particleSystem.cpp b/Extras/CUDA/particleSystem.cpp
index 6ce9da8a0..c75bfd7dd 100644
--- a/Extras/CUDA/particleSystem.cpp
+++ b/Extras/CUDA/particleSystem.cpp
@@ -664,6 +664,7 @@ void	ParticleSystem::initializeBullet()
 //	m_broadphase = new btDbvtBroadphase();
 //	m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3));
 	m_broadphase = new btCudaBroadphase(btVector3(-1, -1, -1), btVector3(1, 1, 1), 64, 64, 64, m_params.numBodies, 16, 64, 8, btScalar(1.0f/1.733f));
+//	m_broadphase = new bt3DGridBroadphase(btVector3(-1, -1, -1), btVector3(1, 1, 1), 64, 64, 64, m_params.numBodies, 16, 64, 8, btScalar(1.0f/1.733f));
 
 
 	m_constraintSolver=new btSequentialImpulseConstraintSolver();