From 09aa2dbbe7d968ca7d63d8af3be03bdce0da1f5c Mon Sep 17 00:00:00 2001 From: rponom Date: Tue, 25 Nov 2008 03:16:11 +0000 Subject: [PATCH] CPU implementation of btCudaBroadphase added. It is called bt3DGridBroadphase and btCudaBroadphase is now derived from it rater than from btSimpleBroadphase Test of bt3DGridBroadphase was added to CDTestFramework --- .../BulletSAPCompleteBoxPruningTest.cpp | 4 + Extras/CDTestFramework/CDTestFramework.cpp | 3 + Extras/CUDA/bt3DGridBroadphase.cpp | 604 ++++++++++++++++++ Extras/CUDA/bt3DGridBroadphaseFunc.h | 444 +++++++++++++ Extras/CUDA/btCudaBroadphase.cpp | 453 +++---------- Extras/CUDA/btCudaBroadphase.cu | 460 +------------ Extras/CUDA/btCudaBroadphase.h | 88 ++- Extras/CUDA/btCudaBroadphaseKernel.h | 21 + Extras/CUDA/libbulletcuda.vcproj | 8 + Extras/CUDA/particleSystem.cpp | 1 + 10 files changed, 1247 insertions(+), 839 deletions(-) create mode 100644 Extras/CUDA/bt3DGridBroadphase.cpp create mode 100644 Extras/CUDA/bt3DGridBroadphaseFunc.h diff --git a/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp b/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp index cb4aaaa7d..6cc3ef612 100644 --- a/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp +++ b/Extras/CDTestFramework/BulletSAPCompleteBoxPruningTest.cpp @@ -433,6 +433,10 @@ BulletSAPCompleteBoxPruningTest::BulletSAPCompleteBoxPruningTest(int numBoxes,in // m_broadphase = new btCudaBroadphase(aabbMin, aabbMax, 32, 32, 32, 8192, 8192, 64, 16); methodname = "btCudaBroadphase"; break; + case 9: + m_broadphase = new bt3DGridBroadphase(aabbMin, aabbMax, 24, 24, 24,maxNumBoxes , maxNumBoxes, 64, 16); + methodname = "bt3DGridBroadphase"; + break; default: { m_broadphase = new btAxisSweep3(aabbMin,aabbMax,numBoxes,new btNullPairCache()); diff --git a/Extras/CDTestFramework/CDTestFramework.cpp b/Extras/CDTestFramework/CDTestFramework.cpp index 5397aaff6..db0d42cff 100644 --- a/Extras/CDTestFramework/CDTestFramework.cpp +++ b/Extras/CDTestFramework/CDTestFramework.cpp @@ -76,6 +76,7 @@ enum TestIndex // TEST_BIPARTITE_BOX_PRUNING, TEST_DBVT_8192, TEST_BULLET_CUDA_8192, + TEST_BULLET_3DGRID_8192, TEST_OPCODE_ARRAY_SAP, MAX_NB_TESTS }; @@ -312,6 +313,7 @@ int main(int argc, char** argv) // {TEST_BIPARTITE_BOX_PRUNING, "Bipartite box pruning"}, {TEST_DBVT_8192, "Bullet DBVT 8192"}, {TEST_BULLET_CUDA_8192, "Bullet CUDA 8192"}, + {TEST_BULLET_3DGRID_8192, "Bullet 3D Grid 8192"}, {TEST_OPCODE_ARRAY_SAP, "OPCODE ARRAY SAP"}, }; TwType testType = TwDefineEnum("CollisionTest", testEV, MAX_NB_TESTS); @@ -335,6 +337,7 @@ int main(int argc, char** argv) // gCollisionTests[TEST_BIPARTITE_BOX_PRUNING] = new BipartiteBoxPruningTest; gCollisionTests[TEST_DBVT_8192] = new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,7); gCollisionTests[TEST_BULLET_CUDA_8192] = new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,8); + gCollisionTests[TEST_BULLET_3DGRID_8192] = new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,9); gCollisionTests[TEST_OPCODE_ARRAY_SAP] = new OpcodeArraySAPTest(NUM_SAP_BOXES); for(int i=0;i + +//-------------------------------------------------------------------------- + +static btCudaBroadphaseParams s3DGridBroadphaseParams; + +//-------------------------------------------------------------------------- + +bt3DGridBroadphase::bt3DGridBroadphase( const btVector3& worldAabbMin,const btVector3& worldAabbMax, + int gridSizeX, int gridSizeY, int gridSizeZ, + int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody, + int maxBodiesPerCell, + btScalar cellFactorAABB) : + btSimpleBroadphase(maxSmallProxies, +// new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache), + new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache), + m_bInitialized(false), + m_numBodies(0) +{ + m_ownsPairCache = true; + m_params.m_gridSizeX = gridSizeX; + m_params.m_gridSizeY = gridSizeY; + m_params.m_gridSizeZ = gridSizeZ; + m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ; + btVector3 w_org = worldAabbMin; + m_params.m_worldOriginX = w_org.getX(); + m_params.m_worldOriginY = w_org.getY(); + m_params.m_worldOriginZ = w_org.getZ(); + btVector3 w_size = worldAabbMax - worldAabbMin; + m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX; + m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY; + m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ; + m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ); + m_maxRadius *= btScalar(0.5f); + m_params.m_numBodies = m_numBodies; + m_params.m_maxBodiesPerCell = maxBodiesPerCell; + + m_numLargeHandles = 0; + m_maxLargeHandles = maxLargeProxies; + + m_maxPairsPerBody = maxPairsPerBody; + + m_cellFactorAABB = cellFactorAABB; + + m_LastLargeHandleIndex = -1; + + _initialize(); +} // bt3DGridBroadphase::bt3DGridBroadphase() + +//-------------------------------------------------------------------------- + +bt3DGridBroadphase::~bt3DGridBroadphase() +{ + //btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache + assert(m_bInitialized); + _finalize(); +} // bt3DGridBroadphase::~bt3DGridBroadphase() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::_initialize() +{ + assert(!m_bInitialized); + // allocate host storage + m_hBodiesHash = new unsigned int[m_maxHandles * 2]; + memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int)); + + m_hCellStart = new unsigned int[m_params.m_numCells]; + memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int)); + + m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2]; + // --------------- for now, init with m_maxPairsPerBody for each body + m_hPairBuffStartCurr[0] = 0; + m_hPairBuffStartCurr[1] = 0; + for(int i = 1; i <= m_maxHandles; i++) + { + m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody; + m_hPairBuffStartCurr[i * 2 + 1] = 0; + } + //---------------- + unsigned int numAABB = m_maxHandles + m_maxLargeHandles; + m_hAABB = new btCuda3F1U[numAABB * 2]; // AABB Min & Max + + m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody]; + memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed? + + m_hPairScan = new unsigned int[m_maxHandles + 1]; + + m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody]; + +// large proxies + + // allocate handles buffer and put all handles on free list + m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16); + m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles]; + m_firstFreeLargeHandle = 0; + { + for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++) + { + m_pLargeHandles[i].SetNextFree(i + 1); + m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i; + } + m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0); + } + +// debug data + m_numPairsAdded = 0; + m_numOverflows = 0; + + m_bInitialized = true; +} // bt3DGridBroadphase::_initialize() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::_finalize() +{ + assert(m_bInitialized); + delete [] m_hBodiesHash; + delete [] m_hCellStart; + delete [] m_hPairBuffStartCurr; + delete [] m_hAABB; + delete [] m_hPairBuff; + delete [] m_hPairScan; + delete [] m_hPairOut; + btAlignedFree(m_pLargeHandlesRawPtr); + m_bInitialized = false; +} // bt3DGridBroadphase::_finalize() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) +{ + if(m_numHandles <= 0) + { + BT_PROFILE("addLarge2LargePairsToCache"); + addLarge2LargePairsToCache(dispatcher); + return; + } + // update constants + setParameters(&m_params); + // prepare AABB array + prepareAABB(); + // calculate hash + calcHashAABB(); + // sort bodies based on hash + sortHash(); + // find start of each cell + findCellStart(); + // findOverlappingPairs (small/small) + findOverlappingPairs(); + // findOverlappingPairs (small/large) + findPairsLarge(); + // add pairs to CPU cache + computePairCacheChanges(); + scanOverlappingPairBuff(); + squeezeOverlappingPairBuff(); + addPairsToCache(dispatcher); + // find and add large/large pairs to CPU cache + addLarge2LargePairsToCache(dispatcher); + return; +} // bt3DGridBroadphase::calculateOverlappingPairs() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher) +{ + m_numPairsAdded = 0; + m_numPairsRemoved = 0; + for(int i = 0; i < m_numHandles; i++) + { + unsigned int num = m_hPairScan[i+1] - m_hPairScan[i]; + if(!num) + { + continue; + } + unsigned int* pInp = m_hPairOut + m_hPairScan[i]; + unsigned int index0 = m_hAABB[i * 2].uw; + btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0]; + for(unsigned int j = 0; j < num; j++) + { + unsigned int indx1_s = pInp[j]; + unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG); + btSimpleBroadphaseProxy* proxy1; + if(index1 < (unsigned int)m_maxHandles) + { + proxy1 = &m_pHandles[index1]; + } + else + { + index1 -= m_maxHandles; + btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles)); + proxy1 = &m_pLargeHandles[index1]; + } + if(indx1_s & BT_CUDA_PAIR_NEW_FLG) + { + m_pairCache->addOverlappingPair(proxy0,proxy1); + m_numPairsAdded++; + } + else + { + m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); + m_numPairsRemoved++; + } + } + } +} // bt3DGridBroadphase::addPairsToCache() + +//-------------------------------------------------------------------------- + +btBroadphaseProxy* bt3DGridBroadphase::createProxy( const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy) +{ + btBroadphaseProxy* proxy; + bool bIsLarge = isLargeProxy(aabbMin, aabbMax); + if(bIsLarge) + { + if (m_numLargeHandles >= m_maxLargeHandles) + { + btAssert(0); + return 0; //should never happen, but don't let the game crash ;-) + } + btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2])); + int newHandleIndex = allocLargeHandle(); + proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy); + } + else + { + proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy); + } + return proxy; +} // bt3DGridBroadphase::createProxy() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher) +{ + bool bIsLarge = isLargeProxy(proxy); + if(bIsLarge) + { + + btSimpleBroadphaseProxy* proxy0 = static_cast(proxy); + freeLargeHandle(proxy0); + m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher); + } + else + { + btSimpleBroadphase::destroyProxy(proxy, dispatcher); + } + return; +} // bt3DGridBroadphase::destroyProxy() + +//-------------------------------------------------------------------------- + +bool bt3DGridBroadphase::isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax) +{ + btVector3 diag = aabbMax - aabbMin; + btScalar radius = diag.length() * btScalar(0.5f); + + radius *= m_cellFactorAABB; // user-defined factor + + return (radius > m_maxRadius); +} // bt3DGridBroadphase::isLargeProxy() + +//-------------------------------------------------------------------------- + +bool bt3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy) +{ + return (proxy->getUid() >= (m_maxHandles+2)); +} // bt3DGridBroadphase::isLargeProxy() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher) +{ + int i,j; + if (m_numLargeHandles <= 0) + { + return; + } + int new_largest_index = -1; + for(i = 0; i <= m_LastLargeHandleIndex; i++) + { + btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i]; + if(!proxy0->m_clientObject) + { + continue; + } + new_largest_index = i; + for(j = i + 1; j <= m_LastLargeHandleIndex; j++) + { + btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j]; + if(!proxy1->m_clientObject) + { + continue; + } + btAssert(proxy0 != proxy1); + btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0); + btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1); + if(aabbOverlap(p0,p1)) + { + if (!m_pairCache->findPair(proxy0,proxy1)) + { + m_pairCache->addOverlappingPair(proxy0,proxy1); + } + } + else + { + if(m_pairCache->findPair(proxy0,proxy1)) + { + m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); + } + } + } + } + m_LastLargeHandleIndex = new_largest_index; + return; +} // bt3DGridBroadphase::addLarge2LargePairsToCache() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback) +{ + btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback); + for (int i=0; i <= m_LastLargeHandleIndex; i++) + { + btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i]; + if(!proxy->m_clientObject) + { + continue; + } + rayCallback.process(proxy); + } +} // bt3DGridBroadphase::rayTest() + +//-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- +// +// overrides for CPU version +// +//-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::prepareAABB() +{ + BT_PROFILE("prepareAABB"); + btCuda3F1U* pBB = m_hAABB; + int i; + int new_largest_index = -1; + unsigned int num_small = 0; + for(i = 0; i <= m_LastHandleIndex; i++) + { + btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i]; + if(!proxy0->m_clientObject) + { + continue; + } + new_largest_index = i; + pBB->fx = proxy0->m_aabbMin.getX(); + pBB->fy = proxy0->m_aabbMin.getY(); + pBB->fz = proxy0->m_aabbMin.getZ(); + pBB->uw = i; + pBB++; + pBB->fx = proxy0->m_aabbMax.getX(); + pBB->fy = proxy0->m_aabbMax.getY(); + pBB->fz = proxy0->m_aabbMax.getZ(); + pBB->uw = num_small; + pBB++; + num_small++; + } + m_LastHandleIndex = new_largest_index; + new_largest_index = -1; + unsigned int num_large = 0; + for(i = 0; i <= m_LastLargeHandleIndex; i++) + { + btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i]; + if(!proxy0->m_clientObject) + { + continue; + } + new_largest_index = i; + pBB->fx = proxy0->m_aabbMin.getX(); + pBB->fy = proxy0->m_aabbMin.getY(); + pBB->fz = proxy0->m_aabbMin.getZ(); + pBB->uw = i + m_maxHandles; + pBB++; + pBB->fx = proxy0->m_aabbMax.getX(); + pBB->fy = proxy0->m_aabbMax.getY(); + pBB->fz = proxy0->m_aabbMax.getZ(); + pBB->uw = num_large + m_maxHandles; + pBB++; + num_large++; + } + m_LastLargeHandleIndex = new_largest_index; + // paranoid checks + btAssert(num_small == m_numHandles); + btAssert(num_large == m_numLargeHandles); + return; +} // bt3DGridBroadphase::prepareAABB() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::setParameters(btCudaBroadphaseParams* hostParams) +{ + s3DGridBroadphaseParams = *hostParams; + return; +} // bt3DGridBroadphase::setParameters() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::calcHashAABB() +{ + BT_PROFILE("bt3DGrid_calcHashAABB"); + bt3DGrid_calcHashAABB(m_hAABB, m_hBodiesHash, m_numHandles); + return; +} // bt3DGridBroadphase::calcHashAABB() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::sortHash() +{ + class bt3DGridHashKey + { + public: + unsigned int hash; + unsigned int index; + void quickSort(bt3DGridHashKey* pData, int lo, int hi) + { + int i=lo, j=hi; + bt3DGridHashKey x = pData[(lo+hi)/2]; + do + { + while(pData[i].hash > x.hash) i++; + while(x.hash > pData[j].hash) j--; + if(i <= j) + { + bt3DGridHashKey t = pData[i]; + pData[i] = pData[j]; + pData[j] = t; + i++; j--; + } + } while(i <= j); + if(lo < j) pData->quickSort(pData, lo, j); + if(i < hi) pData->quickSort(pData, i, hi); + } + }; + BT_PROFILE("bt3DGrid_sortHash"); + bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash; + pHash->quickSort(pHash, 0, m_numHandles - 1); + return; +} // bt3DGridBroadphase::sortHash() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::findCellStart() +{ + BT_PROFILE("bt3DGrid_findCellStart"); + bt3DGrid_findCellStart(m_hBodiesHash, m_hCellStart, m_numHandles, m_params.m_numCells); + return; +} // bt3DGridBroadphase::findCellStart() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::findOverlappingPairs() +{ + BT_PROFILE("bt3DGrid_findOverlappingPairs"); + bt3DGrid_findOverlappingPairs(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles); + return; +} // bt3DGridBroadphase::findOverlappingPairs() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::findPairsLarge() +{ + BT_PROFILE("bt3DGrid_findPairsLarge"); + bt3DGrid_findPairsLarge(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles, m_numLargeHandles); + return; +} // bt3DGridBroadphase::findPairsLarge() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::computePairCacheChanges() +{ + BT_PROFILE("bt3DGrid_computePairCacheChanges"); + bt3DGrid_computePairCacheChanges(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hAABB, m_numHandles); + return; +} // bt3DGridBroadphase::computePairCacheChanges() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::scanOverlappingPairBuff() +{ + BT_PROFILE("bt3DGrid_scanOverlappingPairBuff"); + m_hPairScan[0] = 0; + for(int i = 1; i <= m_numHandles; i++) + { + unsigned int delta = m_hPairScan[i]; + m_hPairScan[i] = m_hPairScan[i-1] + delta; + } + return; +} // bt3DGridBroadphase::scanOverlappingPairBuff() + +//-------------------------------------------------------------------------- + +void bt3DGridBroadphase::squeezeOverlappingPairBuff() +{ + BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff"); + bt3DGrid_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hPairOut, m_hAABB, m_numHandles); + return; +} // bt3DGridBroadphase::squeezeOverlappingPairBuff() + +//-------------------------------------------------------------------------- + +typedef unsigned int uint; + +struct uint2 +{ + unsigned int x, y; +}; + +struct int3 +{ + int x, y, z; +}; + +struct uint3 +{ + unsigned int x, y, z; +}; + +struct float4 +{ + float x, y, z, w; +}; + + +#define BT3DGRID__device__ inline +#define BT3DGRIDmax(a, b) ((a) > (b) ? (a) : (b)) +#define BT3DGRIDmin(a, b) ((a) < (b) ? (a) : (b)) +#define BT3DGRIDparams s3DGridBroadphaseParams +#define BT3DGRID__mul24(a, b) ((a)*(b)) +#define BT3DGRID__global__ inline +#define BT3DGRID__shared__ static +#define BT3DGRID__syncthreads() + + +static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y) +{ + uint2 t; t.x = x; t.y = y; return t; +} +#define BT3DGRIDmake_uint2(x, y) bt3dGrid_make_uint2(x, y) + +static inline int3 bt3dGrid_make_int3(int x, int y, int z) +{ + int3 t; t.x = x; t.y = y; t.z = z; return t; +} +inline int3 operator+(int3 a, int3 b) +{ + return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +#define BT3DGRIDmake_int3(x, y, z) bt3dGrid_make_int3(x, y, z) + +#define BT3DGRIDFETCH(a, b) a[b] +#define BT3DGRIDPREF(func) bt3DGrid_##func +#define MY_CUDA_SAFE_CALL(func) func +#define BT3DGPRDMemset memset + +static uint2 s_blockIdx, s_blockDim, s_threadIdx; +#define BT3DGRIDblockIdx s_blockIdx +#define BT3DGRIDblockDim s_blockDim +#define BT3DGRIDthreadIdx s_threadIdx +#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb= (int)numBodies) + { + return; + } + btCuda3F1U bbMin = pAABB[index*2]; + btCuda3F1U bbMax = pAABB[index*2 + 1]; + float4 pos; + pos.x = (bbMin.fx + bbMax.fx) * 0.5f; + pos.y = (bbMin.fy + bbMax.fy) * 0.5f; + pos.z = (bbMin.fz + bbMax.fz) * 0.5f; + // get address in grid + int3 gridPos = btCuda_calcGridPos(pos); + uint gridHash = btCuda_calcGridHash(gridPos); + // store grid hash and body index + pHash[index] = BT3DGRIDmake_uint2(gridHash, index); +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies) +{ + int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x; + if(index >= (int)numBodies) + { + return; + } + uint2 sortedData = pHash[index]; + // Load hash data into shared memory so that we can look + // at neighboring body's hash value without loading + // two hash values per thread + BT3DGRID__shared__ uint sharedHash[257]; + sharedHash[BT3DGRIDthreadIdx.x+1] = sortedData.x; + if((index > 0) && (BT3DGRIDthreadIdx.x == 0)) + { + // first thread in block must load neighbor body hash + volatile uint2 prevData = pHash[index-1]; + sharedHash[0] = prevData.x; + } + BT3DGRID__syncthreads(); + if((index == 0) || (sortedData.x != sharedHash[BT3DGRIDthreadIdx.x])) + { + cellStart[sortedData.x] = index; + } +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1) +{ + return (min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && + (min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && + (min0.fz <= max1.fz)&& (min1.fz <= max0.fz); +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__device__ void findPairsInCell(int3 gridPos, + uint index, + uint2* pHash, + uint* pCellStart, + btCuda3F1U* pAABB, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numBodies) +{ + if ( (gridPos.x < 0) || (gridPos.x > (int)BT3DGRIDparams.m_gridSizeX - 1) + || (gridPos.y < 0) || (gridPos.y > (int)BT3DGRIDparams.m_gridSizeY - 1) + || (gridPos.z < 0) || (gridPos.z > (int)BT3DGRIDparams.m_gridSizeZ - 1)) + { + return; + } + uint gridHash = btCuda_calcGridHash(gridPos); + // get start of bucket for this cell + uint bucketStart = pCellStart[gridHash]; + if (bucketStart == 0xffffffff) + { + return; // cell empty + } + // iterate over bodies in this cell + uint2 sortedData = pHash[index]; + uint unsorted_indx = sortedData.y; + btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2); + btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1); + uint handleIndex = min0.uw; + uint2 start_curr = pPairBuffStartCurr[handleIndex]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1]; + uint curr_max = start_curr_next.x - start - 1; + uint bucketEnd = bucketStart + BT3DGRIDparams.m_maxBodiesPerCell; + bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd; + for(uint index2 = bucketStart; index2 < bucketEnd; index2++) + { + uint2 cellData = pHash[index2]; + if (cellData.x != gridHash) + { + break; // no longer in same bucket + } + uint unsorted_indx2 = cellData.y; + if (unsorted_indx2 < unsorted_indx) // check not colliding with self + { + btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2); + btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2 + 1); + if(cudaTestAABBOverlap(min0, max0, min1, max1)) + { + uint handleIndex2 = min1.uw; + uint k; + for(k = 0; k < curr; k++) + { + uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG); + if(old_pair == handleIndex2) + { + pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG; + break; + } + } + if(k == curr) + { + pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG; + if(curr >= curr_max) + { // not a good solution, but let's avoid crash + break; + } + curr++; + } + } + } + } + pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr); + return; +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__global__ void +findOverlappingPairsD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, + uint2* pPairBuffStartCurr, uint numBodies) +{ + int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x; + if(index >= (int)numBodies) + { + return; + } + uint2 sortedData = pHash[index]; + uint unsorted_indx = sortedData.y; + btCuda3F1U bbMin = BT3DGRIDFETCH(pAABB, unsorted_indx*2); + btCuda3F1U bbMax = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1); + float4 pos; + pos.x = (bbMin.fx + bbMax.fx) * 0.5f; + pos.y = (bbMin.fy + bbMax.fy) * 0.5f; + pos.z = (bbMin.fz + bbMax.fz) * 0.5f; + // get address in grid + int3 gridPos = btCuda_calcGridPos(pos); + // examine only neighbouring cells + for(int z=-1; z<=1; z++) { + for(int y=-1; y<=1; y++) { + for(int x=-1; x<=1; x++) { + findPairsInCell(gridPos + BT3DGRIDmake_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies); + } + } + } +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__global__ void +findPairsLargeD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, + uint2* pPairBuffStartCurr, uint numBodies, uint numLarge) +{ + int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x; + if(index >= (int)numBodies) + { + return; + } + uint2 sortedData = pHash[index]; + uint unsorted_indx = sortedData.y; + btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2); + btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1); + uint handleIndex = min0.uw; + uint2 start_curr = pPairBuffStartCurr[handleIndex]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1]; + uint curr_max = start_curr_next.x - start - 1; + for(uint i = 0; i < numLarge; i++) + { + uint indx2 = numBodies + i; + btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, indx2*2); + btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, indx2*2 + 1); + if(cudaTestAABBOverlap(min0, max0, min1, max1)) + { + uint k; + uint handleIndex2 = min1.uw; + for(k = 0; k < curr; k++) + { + uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG); + if(old_pair == handleIndex2) + { + pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG; + break; + } + } + if(k == curr) + { + pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG; + if(curr >= curr_max) + { // not a good solution, but let's avoid crash + break; + } + curr++; + } + } + } + pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr); + return; +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies) +{ + int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x; + if(index >= (int)numBodies) + { + return; + } + btCuda3F1U bbMin = pAABB[index * 2]; + uint handleIndex = bbMin.uw; + uint2 start_curr = pPairBuffStartCurr[handleIndex]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint *pInp = pPairBuff + start; + uint num_changes = 0; + for(uint k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) + { + num_changes++; + } + } + pPairScan[index+1] = num_changes; +} + +//---------------------------------------------------------------------------------------- + +BT3DGRID__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies) +{ + int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x; + if(index >= (int)numBodies) + { + return; + } + btCuda3F1U bbMin = pAABB[index * 2]; + uint handleIndex = bbMin.uw; + uint2 start_curr = pPairBuffStartCurr[handleIndex]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint* pInp = pPairBuff + start; + uint* pOut = pPairOut + pPairScan[index]; + uint* pOut2 = pInp; + uint num = 0; + for(uint k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) + { + *pOut = *pInp; + pOut++; + } + if((*pInp) & BT_CUDA_PAIR_ANY_FLG) + { + *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG); + pOut2++; + num++; + } + } + pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, num); +} // squeezeOverlappingPairBuffD() + + +//---------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------- +// E N D O F K E R N E L F U N C T I O N S +//---------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------- + +extern "C" +{ + +//Round a / b to nearest higher integer value +int BT3DGRIDPREF(iDivUp)(int a, int b) +{ + return (a % b != 0) ? (a / b + 1) : (a / b); +} + +// compute grid and thread block size for a given number of elements +void BT3DGRIDPREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads) +{ + numThreads = BT3DGRIDmin(blockSize, n); + numBlocks = BT3DGRIDPREF(iDivUp)(n, numThreads); +} + +void BT3DGRIDPREF(calcHashAABB)(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies) +{ + int numThreads, numBlocks; + BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads); + // execute the kernel + BT3DGRIDEXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, (uint2*)hash, numBodies)); + // check if kernel invocation generated an error + CUT_CHECK_ERROR("calcHashAABBD kernel execution failed"); +} + +void BT3DGRIDPREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)) +{ + int numThreads, numBlocks; + BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads); + MY_CUDA_SAFE_CALL(BT3DGPRDMemset(cellStart, 0xffffffff, numCells*sizeof(uint))); + BT3DGRIDEXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies)); + CUT_CHECK_ERROR("Kernel execution failed: findCellStartD"); +} + +void BT3DGRIDPREF(findOverlappingPairs(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies)) +{ +#if B_CUDA_USE_TEX + MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U))); +#endif + int numThreads, numBlocks; + BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads); + BT3DGRIDEXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies)); + CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD"); +#if B_CUDA_USE_TEX + MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex)); +#endif + } + + + +void BT3DGRIDPREF(findPairsLarge(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge)) +{ +#if B_CUDA_USE_TEX + MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U))); +#endif + int numThreads, numBlocks; + BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads); + BT3DGRIDEXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge)); + CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD"); +#if B_CUDA_USE_TEX + MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex)); +#endif + } + + +void BT3DGRIDPREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies)) +{ + int numThreads, numBlocks; + BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads); + BT3DGRIDEXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies)); + CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD"); + } + + +void BT3DGRIDPREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies)) +{ + int numThreads, numBlocks; + BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads); + BT3DGRIDEXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies)); + CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD"); +} // btCuda_squeezeOverlappingPairBuff() + + +} // extern "C" diff --git a/Extras/CUDA/btCudaBroadphase.cpp b/Extras/CUDA/btCudaBroadphase.cpp index ab873439a..44e22fa87 100644 --- a/Extras/CUDA/btCudaBroadphase.cpp +++ b/Extras/CUDA/btCudaBroadphase.cpp @@ -34,39 +34,8 @@ btCudaBroadphase::btCudaBroadphase(const btVector3& worldAabbMin,const btVector3 int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody, int maxBodiesPerCell, btScalar cellFactorAABB) : - btSimpleBroadphase(maxSmallProxies, -// new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache), - new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache), - m_bInitialized(false), - m_numBodies(0) + bt3DGridBroadphase(worldAabbMin, worldAabbMax, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerBody, maxBodiesPerCell,cellFactorAABB) { - m_ownsPairCache = true; - m_params.m_gridSizeX = gridSizeX; - m_params.m_gridSizeY = gridSizeY; - m_params.m_gridSizeZ = gridSizeZ; - m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ; - btVector3 w_org = worldAabbMin; - m_params.m_worldOriginX = w_org.getX(); - m_params.m_worldOriginY = w_org.getY(); - m_params.m_worldOriginZ = w_org.getZ(); - btVector3 w_size = worldAabbMax - worldAabbMin; - m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX; - m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY; - m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ; - m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ); - m_maxRadius *= btScalar(0.5f); - m_params.m_numBodies = m_numBodies; - m_params.m_maxBodiesPerCell = maxBodiesPerCell; - - m_numLargeHandles = 0; - m_maxLargeHandles = maxLargeProxies; - - m_maxPairsPerBody = maxPairsPerBody; - - m_cellFactorAABB = cellFactorAABB; - - m_LastLargeHandleIndex = -1; - _initialize(); } // btCudaBroadphase::btCudaBroadphase() @@ -83,34 +52,6 @@ btCudaBroadphase::~btCudaBroadphase() void btCudaBroadphase::_initialize() { - assert(!m_bInitialized); - // allocate host storage - m_hBodiesHash = new unsigned int[m_maxHandles * 2]; - memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int)); - - m_hCellStart = new unsigned int[m_params.m_numCells]; - memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int)); - - m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2]; - // --------------- for now, init with m_maxPairsPerBody for each body - m_hPairBuffStartCurr[0] = 0; - m_hPairBuffStartCurr[1] = 0; - for(int i = 1; i <= m_maxHandles; i++) - { - m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody; - m_hPairBuffStartCurr[i * 2 + 1] = 0; - } - //---------------- - unsigned int numAABB = m_maxHandles + m_maxLargeHandles; - m_hAABB = new btCuda3F1U[numAABB * 2]; // AABB Min & Max - - m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody]; - memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed? - - m_hPairScan = new unsigned int[m_maxHandles + 1]; - - m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody]; - // allocate GPU data btCuda_allocateArray((void**)&m_dBodiesHash[0], m_maxHandles * 2 * sizeof(unsigned int)); btCuda_allocateArray((void**)&m_dBodiesHash[1], m_maxHandles * 2 * sizeof(unsigned int)); @@ -123,34 +64,12 @@ void btCudaBroadphase::_initialize() btCuda_allocateArray((void**)&m_dPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); btCuda_copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); + unsigned int numAABB = m_maxHandles + m_maxLargeHandles; btCuda_allocateArray((void**)&m_dAABB, numAABB * sizeof(btCuda3F1U) * 2); btCuda_allocateArray((void**)&m_dPairScan, (m_maxHandles + 1) * sizeof(unsigned int)); btCuda_allocateArray((void**)&m_dPairOut, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); - - btCuda_setParameters(&m_params); - -// large proxies - - // allocate handles buffer and put all handles on free list - m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16); - m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles]; - m_firstFreeLargeHandle = 0; - { - for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++) - { - m_pLargeHandles[i].SetNextFree(i + 1); - m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i; - } - m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0); - } - -// debug data - m_numPairsAdded = 0; - m_numOverflows = 0; - - m_bInitialized = true; } // btCudaBroadphase::_initialize() //-------------------------------------------------------------------------- @@ -179,324 +98,104 @@ void btCudaBroadphase::_finalize() m_bInitialized = false; } // btCudaBroadphase::_finalize() +//-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- +// +// overrides for CUDA version +// +//-------------------------------------------------------------------------- //-------------------------------------------------------------------------- -void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) +void btCudaBroadphase::prepareAABB() { - if(m_numHandles <= 0) - { - BT_PROFILE("addLarge2LargePairsToCache -- CPU"); - addLarge2LargePairsToCache(dispatcher); - return; - } + bt3DGridBroadphase::prepareAABB(); + btCuda_copyArrayToDevice(m_dAABB, m_hAABB, sizeof(btCuda3F1U) * 2 * (m_numHandles + m_numLargeHandles)); + return; +} // btCudaBroadphase::prepareAABB() - +//-------------------------------------------------------------------------- - // update constants - btCuda_setParameters(&m_params); +void btCudaBroadphase::setParameters(btCudaBroadphaseParams* hostParams) +{ + btCuda_setParameters(hostParams); + return; +} // btCudaBroadphase::setParameters() - +//-------------------------------------------------------------------------- - // move AABB array to GPU - { - BT_PROFILE("copy AABB"); - // do it faster ? - btCuda3F1U* pBB = m_hAABB; - int i; - int new_largest_index = -1; - unsigned int num_small = 0; - - for(i = 0; i <= m_LastHandleIndex; i++) - { - btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i]; - if(!proxy0->m_clientObject) - { - continue; - } - new_largest_index = i; - pBB->fx = proxy0->m_aabbMin.getX(); - pBB->fy = proxy0->m_aabbMin.getY(); - pBB->fz = proxy0->m_aabbMin.getZ(); - pBB->uw = i; - pBB++; - pBB->fx = proxy0->m_aabbMax.getX(); - pBB->fy = proxy0->m_aabbMax.getY(); - pBB->fz = proxy0->m_aabbMax.getZ(); - pBB->uw = num_small; - pBB++; - num_small++; - } - m_LastHandleIndex = new_largest_index; - new_largest_index = -1; - unsigned int num_large = 0; - for(i = 0; i <= m_LastLargeHandleIndex; i++) - { - btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i]; - if(!proxy0->m_clientObject) - { - continue; - } - new_largest_index = i; - pBB->fx = proxy0->m_aabbMin.getX(); - pBB->fy = proxy0->m_aabbMin.getY(); - pBB->fz = proxy0->m_aabbMin.getZ(); - pBB->uw = i + m_maxHandles; - pBB++; - pBB->fx = proxy0->m_aabbMax.getX(); - pBB->fy = proxy0->m_aabbMax.getY(); - pBB->fz = proxy0->m_aabbMax.getZ(); - pBB->uw = num_large + m_maxHandles; - pBB++; - num_large++; - } - m_LastLargeHandleIndex = new_largest_index; - // paranoid checks - btAssert(num_small == m_numHandles); - btAssert(num_large == m_numLargeHandles); - } - - { - BT_PROFILE("CopyBB to CUDA"); - btCuda_copyArrayToDevice(m_dAABB, m_hAABB, sizeof(btCuda3F1U) * 2 * (m_numHandles + m_numLargeHandles)); - } - // calculate hash - { - BT_PROFILE("calcHash -- CUDA"); - btCuda_calcHashAABB(m_dAABB, m_dBodiesHash[0], m_numHandles); - } +void btCudaBroadphase::calcHashAABB() +{ + BT_PROFILE("btCuda_calcHashAABB"); + btCuda_calcHashAABB(m_dAABB, m_dBodiesHash[0], m_numHandles); // btCuda_copyArrayFromDevice((void*)m_hBodiesHash, (void*)m_dBodiesHash[0], sizeof(unsigned int) * 2 * m_numHandles); - // sort bodies based on hash - { - BT_PROFILE("RadixSort-- CUDA"); - RadixSort((KeyValuePair*)m_dBodiesHash[0], (KeyValuePair*)m_dBodiesHash[1], m_numHandles, 32); - } - // find start of each cell - { - BT_PROFILE("Find cell start -- CUDA"); - btCuda_findCellStart(m_dBodiesHash[0], m_dCellStart, m_numHandles, m_params.m_numCells); - } + return; +} // btCudaBroadphase::calcHashAABB() + +//-------------------------------------------------------------------------- + +void btCudaBroadphase::sortHash() +{ + BT_PROFILE("RadixSort-- CUDA"); + RadixSort((KeyValuePair*)m_dBodiesHash[0], (KeyValuePair*)m_dBodiesHash[1], m_numHandles, 32); + return; +} // btCudaBroadphase::sortHash() + +//-------------------------------------------------------------------------- + +void btCudaBroadphase::findCellStart() +{ + BT_PROFILE("btCuda_findCellStart"); + btCuda_findCellStart(m_dBodiesHash[0], m_dCellStart, m_numHandles, m_params.m_numCells); // btCuda_copyArrayFromDevice((void*)m_hBodiesHash, (void*)m_dBodiesHash[0], sizeof(unsigned int) * 2 * m_numHandles); // btCuda_copyArrayFromDevice((void*)m_hCellStart, (void*)m_dCellStart, sizeof(unsigned int) * m_params.m_numCells); - { - BT_PROFILE("FindOverlappingPairs -- CUDA"); - btCuda_findOverlappingPairs(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles); - } - { - BT_PROFILE("FindPairsLarge -- CUDA"); - btCuda_findPairsLarge(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles, m_numLargeHandles); - } - { - BT_PROFILE("ComputePairCacheChanges -- CUDA"); - btCuda_computePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dAABB, m_numHandles); - } - { - BT_PROFILE("scanOverlappingPairBuff -- CPU"); - btCuda_copyArrayFromDevice(m_hPairScan, m_dPairScan, sizeof(unsigned int)*(m_numHandles + 1)); - scanOverlappingPairBuffCPU(); - btCuda_copyArrayToDevice(m_dPairScan, m_hPairScan, sizeof(unsigned int)*(m_numHandles + 1)); - } - { - BT_PROFILE("SqueezeOverlappingPairBuff -- CUDA"); - btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_dAABB, m_numHandles); - } - { - BT_PROFILE("SqueezeOverlappingPairBuff -- CUDA"); - btCuda_copyArrayFromDevice(m_hPairOut, m_dPairOut, sizeof(unsigned int) * m_hPairScan[m_numHandles]); - } - { - BT_PROFILE("addPairsToCache -- CPU"); - addPairsToCacheCPU(dispatcher); - } - { - BT_PROFILE("addLarge2LargePairsToCache -- CPU"); - addLarge2LargePairsToCache(dispatcher); - } return; -} // btCudaBroadphase::calculateOverlappingPairs() +} // btCudaBroadphase::findCellStart() //-------------------------------------------------------------------------- -void btCudaBroadphase::scanOverlappingPairBuffCPU() +void btCudaBroadphase::findOverlappingPairs() { - m_hPairScan[0] = 0; - for(int i = 1; i <= m_numHandles; i++) - { - unsigned int delta = m_hPairScan[i]; - m_hPairScan[i] = m_hPairScan[i-1] + delta; - } -} // btCudaBroadphase::scanOverlappingPairBuffCPU() - -//-------------------------------------------------------------------------- - -void btCudaBroadphase::addPairsToCacheCPU(btDispatcher* dispatcher) -{ - m_numPairsAdded = 0; - m_numPairsRemoved = 0; - for(int i = 0; i < m_numHandles; i++) - { - unsigned int num = m_hPairScan[i+1] - m_hPairScan[i]; - if(!num) - { - continue; - } - unsigned int* pInp = m_hPairOut + m_hPairScan[i]; - unsigned int index0 = m_hAABB[i * 2].uw; - btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0]; - for(unsigned int j = 0; j < num; j++) - { - unsigned int indx1_s = pInp[j]; - unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG); - btSimpleBroadphaseProxy* proxy1; - if(index1 < (unsigned int)m_maxHandles) - { - proxy1 = &m_pHandles[index1]; - } - else - { - index1 -= m_maxHandles; - btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles)); - proxy1 = &m_pLargeHandles[index1]; - } - if(indx1_s & BT_CUDA_PAIR_NEW_FLG) - { - m_pairCache->addOverlappingPair(proxy0,proxy1); - m_numPairsAdded++; - } - else - { - m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); - m_numPairsRemoved++; - } - } - } -} // btCudaBroadphase::addPairsToCacheCPU() - -//-------------------------------------------------------------------------- - -btBroadphaseProxy* btCudaBroadphase::createProxy( const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy) -{ - btBroadphaseProxy* proxy; - bool bIsLarge = isLargeProxy(aabbMin, aabbMax); - if(bIsLarge) - { - if (m_numLargeHandles >= m_maxLargeHandles) - { - btAssert(0); - return 0; //should never happen, but don't let the game crash ;-) - } - btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2])); - int newHandleIndex = allocLargeHandle(); - proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy); - } - else - { - proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy); - } - return proxy; -} // btCudaBroadphase::createProxy() - -//-------------------------------------------------------------------------- - -void btCudaBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher) -{ - bool bIsLarge = isLargeProxy(proxy); - if(bIsLarge) - { - - btSimpleBroadphaseProxy* proxy0 = static_cast(proxy); - freeLargeHandle(proxy0); - // TODO : remove pair from cache on GPU as well !!! - // UPD: they will not be used anyway, so don't waste time - m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher); - } - else - { - btSimpleBroadphase::destroyProxy(proxy, dispatcher); - } + BT_PROFILE("btCuda_findOverlappingPairs"); + btCuda_findOverlappingPairs(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles); return; -} // btCudaBroadphase::destroyProxy() +} // btCudaBroadphase::findOverlappingPairs() //-------------------------------------------------------------------------- -bool btCudaBroadphase::isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax) +void btCudaBroadphase::findPairsLarge() { - btVector3 diag = aabbMax - aabbMin; - btScalar radius = diag.length() * btScalar(0.5f); - - radius *= m_cellFactorAABB; // user-defined factor - - return (radius > m_maxRadius); -} // btCudaBroadphase::isLargeProxy() - -//-------------------------------------------------------------------------- - -bool btCudaBroadphase::isLargeProxy(btBroadphaseProxy* proxy) -{ - return (proxy->getUid() >= (m_maxHandles+2)); -} // btCudaBroadphase::isLargeProxy() - -//-------------------------------------------------------------------------- - -void btCudaBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher) -{ - int i,j; - if (m_numLargeHandles <= 0) - { - return; - } - int new_largest_index = -1; - for(i = 0; i <= m_LastLargeHandleIndex; i++) - { - btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i]; - if(!proxy0->m_clientObject) - { - continue; - } - new_largest_index = i; - for(j = i + 1; j <= m_LastLargeHandleIndex; j++) - { - btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j]; - if(!proxy1->m_clientObject) - { - continue; - } - btAssert(proxy0 != proxy1); - btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0); - btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1); - if(aabbOverlap(p0,p1)) - { - if (!m_pairCache->findPair(proxy0,proxy1)) - { - m_pairCache->addOverlappingPair(proxy0,proxy1); - } - } - else - { - if(m_pairCache->findPair(proxy0,proxy1)) - { - m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); - } - } - } - } - m_LastLargeHandleIndex = new_largest_index; + BT_PROFILE("btCuda_findPairsLarge"); + btCuda_findPairsLarge(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles, m_numLargeHandles); return; -} // btCudaBroadphase::addLarge2LargePairsToCache() +} // btCudaBroadphase::findPairsLarge() //-------------------------------------------------------------------------- -void btCudaBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback) +void btCudaBroadphase::computePairCacheChanges() { - btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback); - for (int i=0; i <= m_LastLargeHandleIndex; i++) - { - btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i]; - if(!proxy->m_clientObject) - { - continue; - } - rayCallback.process(proxy); - } -} // btCudaBroadphase::rayTest() + BT_PROFILE("btCuda_computePairCacheChanges"); + btCuda_computePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dAABB, m_numHandles); + return; +} // btCudaBroadphase::computePairCacheChanges() + +//-------------------------------------------------------------------------- + +void btCudaBroadphase::scanOverlappingPairBuff() +{ + btCuda_copyArrayFromDevice(m_hPairScan, m_dPairScan, sizeof(unsigned int)*(m_numHandles + 1)); + bt3DGridBroadphase::scanOverlappingPairBuff(); + btCuda_copyArrayToDevice(m_dPairScan, m_hPairScan, sizeof(unsigned int)*(m_numHandles + 1)); + return; +} // btCudaBroadphase::scanOverlappingPairBuff() + +//-------------------------------------------------------------------------- + +void btCudaBroadphase::squeezeOverlappingPairBuff() +{ + BT_PROFILE("btCuda_squeezeOverlappingPairBuff"); + btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_dAABB, m_numHandles); + btCuda_copyArrayFromDevice(m_hPairOut, m_dPairOut, sizeof(unsigned int) * m_hPairScan[m_numHandles]); + return; +} // btCudaBroadphase::squeezeOverlappingPairBuff() //-------------------------------------------------------------------------- diff --git a/Extras/CUDA/btCudaBroadphase.cu b/Extras/CUDA/btCudaBroadphase.cu index e21807af8..225a5495c 100644 --- a/Extras/CUDA/btCudaBroadphase.cu +++ b/Extras/CUDA/btCudaBroadphase.cu @@ -65,9 +65,9 @@ __device__ inline btCuda3F1U tex_fetch3F1U(float4 a) { return *((btCuda3F1U*)(&a)); } #if B_CUDA_USE_TEX - #define FETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i)) + #define BT3DGRIDFETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i)) #else - #define FETCH(t, i) t[i] + #define BT3DGRIDFETCH(t, i) t[i] #endif texture particleHashTex; @@ -80,323 +80,25 @@ __constant__ btCudaBroadphaseParams params; //---------------------------------------------------------------------------------------- -// calculate position in uniform grid -__device__ int3 btCuda_calcGridPos(float4 p) -{ - int3 gridPos; - gridPos.x = floor((p.x - params.m_worldOriginX) / params.m_cellSizeX); - gridPos.y = floor((p.y - params.m_worldOriginY) / params.m_cellSizeY); - gridPos.z = floor((p.z - params.m_worldOriginZ) / params.m_cellSizeZ); - return gridPos; -} +#define BT3DGRID__device__ __device__ +#define BT3DGRIDmax(a, b) max(a, b) +#define BT3DGRIDmin(a, b) min(a, b) +#define BT3DGRIDparams params +#define BT3DGRID__mul24(a, b) __mul24(a, b) +#define BT3DGRID__global__ __global__ +#define BT3DGRID__shared__ __shared__ +#define BT3DGRID__syncthreads() __syncthreads() +#define BT3DGRIDmake_uint2(x, y) make_uint2(x, y) +#define BT3DGRIDmake_int3(x, y, z) make_int3(x, y, z) +#define BT3DGRIDPREF(func) btCuda_##func +#define BT3DGPRDMemset cudaMemset +#define BT3DGRIDblockIdx blockIdx +#define BT3DGRIDblockDim blockDim +#define BT3DGRIDthreadIdx threadIdx +#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) kfunc<<>>args //---------------------------------------------------------------------------------------- -// calculate address in grid from position (clamping to edges) -__device__ uint btCuda_calcGridHash(int3 gridPos) -{ - gridPos.x = max(0, min(gridPos.x, params.m_gridSizeX - 1)); - gridPos.y = max(0, min(gridPos.y, params.m_gridSizeY - 1)); - gridPos.z = max(0, min(gridPos.z, params.m_gridSizeZ - 1)); - return __mul24(__mul24(gridPos.z, params.m_gridSizeY), params.m_gridSizeX) + __mul24(gridPos.y, params.m_gridSizeX) + gridPos.x; -} - -//---------------------------------------------------------------------------------------- - -// calculate grid hash value for each body using its AABB -__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies) -{ - int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; - if(index >= numBodies) - { - return; - } - btCuda3F1U bbMin = pAABB[index*2]; - btCuda3F1U bbMax = pAABB[index*2 + 1]; - float4 pos; - pos.x = (bbMin.fx + bbMax.fx) * 0.5f; - pos.y = (bbMin.fy + bbMax.fy) * 0.5f; - pos.z = (bbMin.fz + bbMax.fz) * 0.5f; - // get address in grid - int3 gridPos = btCuda_calcGridPos(pos); - uint gridHash = btCuda_calcGridHash(gridPos); - // store grid hash and body index - pHash[index] = make_uint2(gridHash, index); -} - -//---------------------------------------------------------------------------------------- - -__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies) -{ - int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; - if(index >= numBodies) - { - return; - } - uint2 sortedData = pHash[index]; - // Load hash data into shared memory so that we can look - // at neighboring body's hash value without loading - // two hash values per thread - __shared__ uint sharedHash[257]; - sharedHash[threadIdx.x+1] = sortedData.x; - if((index > 0) && (threadIdx.x == 0)) - { - // first thread in block must load neighbor body hash - volatile uint2 prevData = pHash[index-1]; - sharedHash[0] = prevData.x; - } - __syncthreads(); - if((index == 0) || (sortedData.x != sharedHash[threadIdx.x])) - { - cellStart[sortedData.x] = index; - } -} - -//---------------------------------------------------------------------------------------- - -__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1) -{ - return (min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && - (min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && - (min0.fz <= max1.fz)&& (min1.fz <= max0.fz); -} - -//---------------------------------------------------------------------------------------- - -__device__ void findPairsInCell(int3 gridPos, - uint index, - uint2* pHash, - uint* pCellStart, - btCuda3F1U* pAABB, - uint* pPairBuff, - uint2* pPairBuffStartCurr, - uint numBodies) -{ - if ( (gridPos.x < 0) || (gridPos.x > params.m_gridSizeX - 1) - || (gridPos.y < 0) || (gridPos.y > params.m_gridSizeY - 1) - || (gridPos.z < 0) || (gridPos.z > params.m_gridSizeZ - 1)) - { - return; - } - uint gridHash = btCuda_calcGridHash(gridPos); - // get start of bucket for this cell - uint bucketStart = pCellStart[gridHash]; - if (bucketStart == 0xffffffff) - { - return; // cell empty - } - // iterate over bodies in this cell - uint2 sortedData = pHash[index]; - uint unsorted_indx = sortedData.y; - btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2); - btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1); - uint handleIndex = min0.uw; - uint2 start_curr = pPairBuffStartCurr[handleIndex]; - uint start = start_curr.x; - uint curr = start_curr.y; - uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1]; - uint curr_max = start_curr_next.x - start - 1; - uint bucketEnd = bucketStart + params.m_maxBodiesPerCell; - bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd; - for(uint index2 = bucketStart; index2 < bucketEnd; index2++) - { - uint2 cellData = pHash[index2]; - if (cellData.x != gridHash) - { - break; // no longer in same bucket - } - uint unsorted_indx2 = cellData.y; - if (unsorted_indx2 < unsorted_indx) // check not colliding with self - { - btCuda3F1U min1 = FETCH(pAABB, unsorted_indx2*2); - btCuda3F1U max1 = FETCH(pAABB, unsorted_indx2*2 + 1); - if(cudaTestAABBOverlap(min0, max0, min1, max1)) - { - uint handleIndex2 = min1.uw; - uint k; - for(k = 0; k < curr; k++) - { - uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG); - if(old_pair == handleIndex2) - { - pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG; - break; - } - } - if(k == curr) - { - pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG; - if(curr >= curr_max) - { // not a good solution, but let's avoid crash - break; - } - curr++; - } - } - } - } - pPairBuffStartCurr[handleIndex] = make_uint2(start, curr); - return; -} - -//---------------------------------------------------------------------------------------- - -__global__ void -findOverlappingPairsD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, - uint2* pPairBuffStartCurr, uint numBodies) -{ - int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; - if(index >= numBodies) - { - return; - } - uint2 sortedData = pHash[index]; - uint unsorted_indx = sortedData.y; - btCuda3F1U bbMin = FETCH(pAABB, unsorted_indx*2); - btCuda3F1U bbMax = FETCH(pAABB, unsorted_indx*2 + 1); - float4 pos; - pos.x = (bbMin.fx + bbMax.fx) * 0.5f; - pos.y = (bbMin.fy + bbMax.fy) * 0.5f; - pos.z = (bbMin.fz + bbMax.fz) * 0.5f; - // get address in grid - int3 gridPos = btCuda_calcGridPos(pos); - // examine only neighbouring cells - for(int z=-1; z<=1; z++) { - for(int y=-1; y<=1; y++) { - for(int x=-1; x<=1; x++) { - findPairsInCell(gridPos + make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies); - } - } - } -} - -//---------------------------------------------------------------------------------------- - -__global__ void -findPairsLargeD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, - uint2* pPairBuffStartCurr, uint numBodies, uint numLarge) -{ - int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; - if(index >= numBodies) - { - return; - } - uint2 sortedData = pHash[index]; - uint unsorted_indx = sortedData.y; - btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2); - btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1); - uint handleIndex = min0.uw; - uint2 start_curr = pPairBuffStartCurr[handleIndex]; - uint start = start_curr.x; - uint curr = start_curr.y; - uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1]; - uint curr_max = start_curr_next.x - start - 1; - for(uint i = 0; i < numLarge; i++) - { - uint indx2 = numBodies + i; - btCuda3F1U min1 = FETCH(pAABB, indx2*2); - btCuda3F1U max1 = FETCH(pAABB, indx2*2 + 1); - if(cudaTestAABBOverlap(min0, max0, min1, max1)) - { - uint k; - uint handleIndex2 = min1.uw; - for(k = 0; k < curr; k++) - { - uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG); - if(old_pair == handleIndex2) - { - pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG; - break; - } - } - if(k == curr) - { - pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG; - if(curr >= curr_max) - { // not a good solution, but let's avoid crash - break; - } - curr++; - } - } - } - pPairBuffStartCurr[handleIndex] = make_uint2(start, curr); - return; -} - -//---------------------------------------------------------------------------------------- - -__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies) -{ - int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; - if(index >= numBodies) - { - return; - } - btCuda3F1U bbMin = pAABB[index * 2]; - uint handleIndex = bbMin.uw; - uint2 start_curr = pPairBuffStartCurr[handleIndex]; - uint start = start_curr.x; - uint curr = start_curr.y; - uint *pInp = pPairBuff + start; - uint num_changes = 0; - for(uint k = 0; k < curr; k++, pInp++) - { - if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) - { - num_changes++; - } - } - pPairScan[index+1] = num_changes; -} - -//---------------------------------------------------------------------------------------- - -__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies) -{ - int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; - if(index >= numBodies) - { - return; - } - btCuda3F1U bbMin = pAABB[index * 2]; - uint handleIndex = bbMin.uw; - uint2 start_curr = pPairBuffStartCurr[handleIndex]; - uint start = start_curr.x; - uint curr = start_curr.y; - uint* pInp = pPairBuff + start; - uint* pOut = pPairOut + pPairScan[index]; - uint* pOut2 = pInp; - uint num = 0; - for(uint k = 0; k < curr; k++, pInp++) - { - if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) - { - *pOut = *pInp; - pOut++; - } - if((*pInp) & BT_CUDA_PAIR_ANY_FLG) - { - *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG); - pOut2++; - num++; - } - } - pPairBuffStartCurr[handleIndex] = make_uint2(start, num); -} // squeezeOverlappingPairBuffD() - - -//---------------------------------------------------------------------------------------- -//---------------------------------------------------------------------------------------- -//---------------------------------------------------------------------------------------- -//---------------------------------------------------------------------------------------- -// E N D O F K E R N E L F U N C T I O N S -//---------------------------------------------------------------------------------------- -//---------------------------------------------------------------------------------------- -//---------------------------------------------------------------------------------------- -//---------------------------------------------------------------------------------------- - - //! Check for CUDA error # define CUT_CHECK_ERROR(errorMessage) do { \ cudaError_t err = cudaGetLastError(); \ @@ -430,9 +132,7 @@ __global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffSta btCuda_exit(EXIT_FAILURE); \ } } while (0) - -extern "C" -{ +//---------------------------------------------------------------------------------------- void btCuda_exit(int val) { @@ -465,125 +165,9 @@ void btCuda_setParameters(btCudaBroadphaseParams* hostParams) MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(btCudaBroadphaseParams))); } -//Round a / b to nearest higher integer value -int btCuda_iDivUp(int a, int b) -{ - return (a % b != 0) ? (a / b + 1) : (a / b); -} +//---------------------------------------------------------------------------------------- -// compute grid and thread block size for a given number of elements -void btCuda_computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads) -{ - numThreads = min(blockSize, n); - numBlocks = btCuda_iDivUp(n, numThreads); -} +#include "bt3DGridBroadphaseFunc.h" -void btCuda_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies) -{ - int numThreads, numBlocks; - btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads); - // execute the kernel - calcHashAABBD<<< numBlocks, numThreads >>>(pAABB, (uint2*)hash, numBodies); - // check if kernel invocation generated an error - CUT_CHECK_ERROR("calcHashAABBD kernel execution failed"); -} +//---------------------------------------------------------------------------------------- -void btCuda_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells) -{ - int numThreads, numBlocks; - btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads); - MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint))); - findCellStartD<<< numBlocks, numThreads >>>((uint2*)hash, (uint*)cellStart, numBodies); - CUT_CHECK_ERROR("Kernel execution failed: findCellStartD"); -} - -void btCuda_findOverlappingPairs( btCuda3F1U* pAABB, unsigned int* pHash, - unsigned int* pCellStart, - unsigned int* pPairBuff, - unsigned int* pPairBuffStartCurr, - unsigned int numBodies) -{ -#if B_CUDA_USE_TEX - MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U))); -#endif - int numThreads, numBlocks; - btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads); - findOverlappingPairsD<<< numBlocks, numThreads >>>( - pAABB, - (uint2*)pHash, - (uint*)pCellStart, - (uint*)pPairBuff, - (uint2*)pPairBuffStartCurr, - numBodies - ); - CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD"); -#if B_CUDA_USE_TEX - MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex)); -#endif - } // btCuda_findOverlappingPairs() - - - -void btCuda_findPairsLarge( btCuda3F1U* pAABB, unsigned int* pHash, - unsigned int* pCellStart, - unsigned int* pPairBuff, - unsigned int* pPairBuffStartCurr, - unsigned int numBodies, - unsigned int numLarge) -{ -#if B_CUDA_USE_TEX - MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U))); -#endif - int numThreads, numBlocks; - btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads); - findPairsLargeD<<< numBlocks, numThreads >>>( - pAABB, - (uint2*)pHash, - (uint*)pCellStart, - (uint*)pPairBuff, - (uint2*)pPairBuffStartCurr, - numBodies, - numLarge - ); - CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD"); -#if B_CUDA_USE_TEX - MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex)); -#endif - } // btCuda_findPairsLarge() - - - -void btCuda_computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, - unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies) -{ - int numThreads, numBlocks; - btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads); - computePairCacheChangesD<<< numBlocks, numThreads >>>( - (uint*)pPairBuff, - (uint2*)pPairBuffStartCurr, - (uint*)pPairScan, - pAABB, - numBodies - ); - CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD"); - } // btCuda_computePairCacheChanges() - - -void btCuda_squeezeOverlappingPairBuff( unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, - unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies) -{ - int numThreads, numBlocks; - btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads); - squeezeOverlappingPairBuffD<<< numBlocks, numThreads >>>( - (uint*)pPairBuff, - (uint2*)pPairBuffStartCurr, - (uint*)pPairScan, - (uint*)pPairOut, - pAABB, - numBodies - ); - CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD"); -} // btCuda_squeezeOverlappingPairBuff() - - -} // extern "C" diff --git a/Extras/CUDA/btCudaBroadphase.h b/Extras/CUDA/btCudaBroadphase.h index 1cdc3b790..89ca382b8 100644 --- a/Extras/CUDA/btCudaBroadphase.h +++ b/Extras/CUDA/btCudaBroadphase.h @@ -21,14 +21,18 @@ subject to the following restrictions: #include "btCudaBroadphaseKernel.h" -///The btCudaBroadphase uses CUDA to compute overlapping pairs using a GPU. -class btCudaBroadphase : public btSimpleBroadphase +///The bt3DGridBroadphase uses CUDA to compute overlapping pairs using a GPU. +class bt3DGridBroadphase : public btSimpleBroadphase { +protected: bool m_bInitialized; unsigned int m_numBodies; unsigned int m_numCells; unsigned int m_maxPairsPerBody; btScalar m_cellFactorAABB; + unsigned int m_maxBodiesPerCell; + btCudaBroadphaseParams m_params; + btScalar m_maxRadius; // CPU data unsigned int* m_hBodiesHash; unsigned int* m_hCellStart; @@ -37,17 +41,6 @@ class btCudaBroadphase : public btSimpleBroadphase unsigned int* m_hPairBuff; unsigned int* m_hPairScan; unsigned int* m_hPairOut; - // GPU data - unsigned int* m_dBodiesHash[2]; - unsigned int* m_dCellStart; - unsigned int* m_dPairBuff; - unsigned int* m_dPairBuffStartCurr; - btCuda3F1U* m_dAABB; - unsigned int* m_dPairScan; - unsigned int* m_dPairOut; - unsigned int m_maxBodiesPerCell; - btCudaBroadphaseParams m_params; - btScalar m_maxRadius; // large proxies int m_numLargeHandles; int m_maxLargeHandles; @@ -82,12 +75,56 @@ class btCudaBroadphase : public btSimpleBroadphase } bool isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax); bool isLargeProxy(btBroadphaseProxy* proxy); - // debug unsigned int m_numPairsAdded; unsigned int m_numPairsRemoved; unsigned int m_numOverflows; // +public: + bt3DGridBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax, + int gridSizeX, int gridSizeY, int gridSizeZ, + int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody, + int maxBodiesPerCell = 8, + btScalar cellFactorAABB = btScalar(1.0f)); + virtual ~bt3DGridBroadphase(); + virtual void calculateOverlappingPairs(btDispatcher* dispatcher); + + virtual btBroadphaseProxy* createProxy(const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy); + virtual void destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher); + virtual void rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback); +protected: + void _initialize(); + void _finalize(); + void addPairsToCache(btDispatcher* dispatcher); + void addLarge2LargePairsToCache(btDispatcher* dispatcher); + +// overrides for CPU version + virtual void setParameters(btCudaBroadphaseParams* hostParams); + virtual void prepareAABB(); + virtual void calcHashAABB(); + virtual void sortHash(); + virtual void findCellStart(); + virtual void findOverlappingPairs(); + virtual void findPairsLarge(); + virtual void computePairCacheChanges(); + virtual void scanOverlappingPairBuff(); + virtual void squeezeOverlappingPairBuff(); +}; + + + +///The btCudaBroadphase uses CUDA to compute overlapping pairs using a GPU. +class btCudaBroadphase : public bt3DGridBroadphase +{ +protected: + // GPU data + unsigned int* m_dBodiesHash[2]; + unsigned int* m_dCellStart; + unsigned int* m_dPairBuff; + unsigned int* m_dPairBuffStartCurr; + btCuda3F1U* m_dAABB; + unsigned int* m_dPairScan; + unsigned int* m_dPairOut; public: btCudaBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax, int gridSizeX, int gridSizeY, int gridSizeZ, @@ -95,18 +132,21 @@ public: int maxBodiesPerCell = 8, btScalar cellFactorAABB = btScalar(1.0f)); virtual ~btCudaBroadphase(); - virtual void calculateOverlappingPairs(btDispatcher* dispatcher); - - virtual btBroadphaseProxy* createProxy(const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy); - virtual void destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher); - virtual void rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback); - - protected: void _initialize(); void _finalize(); - void scanOverlappingPairBuffCPU(); - void addPairsToCacheCPU(btDispatcher* dispatcher); - void addLarge2LargePairsToCache(btDispatcher* dispatcher); + void allocateArray(void** devPtr, unsigned int size); + void freeArray(void* devPtr); +// overrides for CUDA version + virtual void setParameters(btCudaBroadphaseParams* hostParams); + virtual void prepareAABB(); + virtual void calcHashAABB(); + virtual void sortHash(); + virtual void findCellStart(); + virtual void findOverlappingPairs(); + virtual void findPairsLarge(); + virtual void computePairCacheChanges(); + virtual void scanOverlappingPairBuff(); + virtual void squeezeOverlappingPairBuff(); }; #endif //CUDA_BROADPHASE_H \ No newline at end of file diff --git a/Extras/CUDA/btCudaBroadphaseKernel.h b/Extras/CUDA/btCudaBroadphaseKernel.h index 5548c076b..a54370097 100644 --- a/Extras/CUDA/btCudaBroadphaseKernel.h +++ b/Extras/CUDA/btCudaBroadphaseKernel.h @@ -54,7 +54,28 @@ struct btCuda3F1U extern "C" { +// CPU functions + void bt3DGrid_setParameters(btCudaBroadphaseParams* hostParams); + void bt3DGrid_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies); + void bt3DGrid_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells); + void bt3DGrid_findOverlappingPairs( btCuda3F1U* pAABB, unsigned int* pHash, + unsigned int* pCellStart, + unsigned int* pPairBuff, + unsigned int* pPairBuffStartCurr, + unsigned int numBodies); + void bt3DGrid_findPairsLarge( btCuda3F1U* pAABB, unsigned int* pHash, + unsigned int* pCellStart, + unsigned int* pPairBuff, + unsigned int* pPairBuffStartCurr, + unsigned int numBodies, + unsigned int numLarge); + void bt3DGrid_computePairCacheChanges( unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, + unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies); + void bt3DGrid_squeezeOverlappingPairBuff( unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, + unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies); + +// CUDA functions void btCuda_allocateArray(void** devPtr, unsigned int size); void btCuda_freeArray(void* devPtr); void btCuda_copyArrayFromDevice(void* host, const void* device, unsigned int size); diff --git a/Extras/CUDA/libbulletcuda.vcproj b/Extras/CUDA/libbulletcuda.vcproj index 53a6d1f6a..3466f0f3c 100644 --- a/Extras/CUDA/libbulletcuda.vcproj +++ b/Extras/CUDA/libbulletcuda.vcproj @@ -506,6 +506,14 @@ + + + + diff --git a/Extras/CUDA/particleSystem.cpp b/Extras/CUDA/particleSystem.cpp index 6ce9da8a0..c75bfd7dd 100644 --- a/Extras/CUDA/particleSystem.cpp +++ b/Extras/CUDA/particleSystem.cpp @@ -664,6 +664,7 @@ void ParticleSystem::initializeBullet() // m_broadphase = new btDbvtBroadphase(); // m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3)); m_broadphase = new btCudaBroadphase(btVector3(-1, -1, -1), btVector3(1, 1, 1), 64, 64, 64, m_params.numBodies, 16, 64, 8, btScalar(1.0f/1.733f)); +// m_broadphase = new bt3DGridBroadphase(btVector3(-1, -1, -1), btVector3(1, 1, 1), 64, 64, 64, m_params.numBodies, 16, 64, 8, btScalar(1.0f/1.733f)); m_constraintSolver=new btSequentialImpulseConstraintSolver();