CPU implementation of btCudaBroadphase added.

It is called bt3DGridBroadphase and btCudaBroadphase is now derived from it rater than from btSimpleBroadphase
Test of bt3DGridBroadphase was added to CDTestFramework
This commit is contained in:
rponom 2008-11-25 03:16:11 +00:00
parent 7dda192bfc
commit 09aa2dbbe7
10 changed files with 1247 additions and 839 deletions

View File

@ -433,6 +433,10 @@ BulletSAPCompleteBoxPruningTest::BulletSAPCompleteBoxPruningTest(int numBoxes,in
// m_broadphase = new btCudaBroadphase(aabbMin, aabbMax, 32, 32, 32, 8192, 8192, 64, 16);
methodname = "btCudaBroadphase";
break;
case 9:
m_broadphase = new bt3DGridBroadphase(aabbMin, aabbMax, 24, 24, 24,maxNumBoxes , maxNumBoxes, 64, 16);
methodname = "bt3DGridBroadphase";
break;
default:
{
m_broadphase = new btAxisSweep3(aabbMin,aabbMax,numBoxes,new btNullPairCache());

View File

@ -76,6 +76,7 @@ enum TestIndex
// TEST_BIPARTITE_BOX_PRUNING,
TEST_DBVT_8192,
TEST_BULLET_CUDA_8192,
TEST_BULLET_3DGRID_8192,
TEST_OPCODE_ARRAY_SAP,
MAX_NB_TESTS
};
@ -312,6 +313,7 @@ int main(int argc, char** argv)
// {TEST_BIPARTITE_BOX_PRUNING, "Bipartite box pruning"},
{TEST_DBVT_8192, "Bullet DBVT 8192"},
{TEST_BULLET_CUDA_8192, "Bullet CUDA 8192"},
{TEST_BULLET_3DGRID_8192, "Bullet 3D Grid 8192"},
{TEST_OPCODE_ARRAY_SAP, "OPCODE ARRAY SAP"},
};
TwType testType = TwDefineEnum("CollisionTest", testEV, MAX_NB_TESTS);
@ -335,6 +337,7 @@ int main(int argc, char** argv)
// gCollisionTests[TEST_BIPARTITE_BOX_PRUNING] = new BipartiteBoxPruningTest;
gCollisionTests[TEST_DBVT_8192] = new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,7);
gCollisionTests[TEST_BULLET_CUDA_8192] = new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,8);
gCollisionTests[TEST_BULLET_3DGRID_8192] = new BulletSAPCompleteBoxPruningTest(NUM_SAP_BOXES,9);
gCollisionTests[TEST_OPCODE_ARRAY_SAP] = new OpcodeArraySAPTest(NUM_SAP_BOXES);
for(int i=0;i<MAX_NB_TESTS;i++)

View File

@ -0,0 +1,604 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//--------------------------------------------------------------------------
#include "LinearMath/btAlignedAllocator.h"
#include "LinearMath/btQuickprof.h"
#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
#include "btCudaBroadphaseKernel.h"
#include "btCudaBroadphase.h"
//--------------------------------------------------------------------------
#include <stdio.h>
//--------------------------------------------------------------------------
static btCudaBroadphaseParams s3DGridBroadphaseParams;
//--------------------------------------------------------------------------
bt3DGridBroadphase::bt3DGridBroadphase( const btVector3& worldAabbMin,const btVector3& worldAabbMax,
int gridSizeX, int gridSizeY, int gridSizeZ,
int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
int maxBodiesPerCell,
btScalar cellFactorAABB) :
btSimpleBroadphase(maxSmallProxies,
// new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache),
new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache),
m_bInitialized(false),
m_numBodies(0)
{
m_ownsPairCache = true;
m_params.m_gridSizeX = gridSizeX;
m_params.m_gridSizeY = gridSizeY;
m_params.m_gridSizeZ = gridSizeZ;
m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
btVector3 w_org = worldAabbMin;
m_params.m_worldOriginX = w_org.getX();
m_params.m_worldOriginY = w_org.getY();
m_params.m_worldOriginZ = w_org.getZ();
btVector3 w_size = worldAabbMax - worldAabbMin;
m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
m_maxRadius *= btScalar(0.5f);
m_params.m_numBodies = m_numBodies;
m_params.m_maxBodiesPerCell = maxBodiesPerCell;
m_numLargeHandles = 0;
m_maxLargeHandles = maxLargeProxies;
m_maxPairsPerBody = maxPairsPerBody;
m_cellFactorAABB = cellFactorAABB;
m_LastLargeHandleIndex = -1;
_initialize();
} // bt3DGridBroadphase::bt3DGridBroadphase()
//--------------------------------------------------------------------------
bt3DGridBroadphase::~bt3DGridBroadphase()
{
//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
assert(m_bInitialized);
_finalize();
} // bt3DGridBroadphase::~bt3DGridBroadphase()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::_initialize()
{
assert(!m_bInitialized);
// allocate host storage
m_hBodiesHash = new unsigned int[m_maxHandles * 2];
memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
m_hCellStart = new unsigned int[m_params.m_numCells];
memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
// --------------- for now, init with m_maxPairsPerBody for each body
m_hPairBuffStartCurr[0] = 0;
m_hPairBuffStartCurr[1] = 0;
for(int i = 1; i <= m_maxHandles; i++)
{
m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
m_hPairBuffStartCurr[i * 2 + 1] = 0;
}
//----------------
unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
m_hAABB = new btCuda3F1U[numAABB * 2]; // AABB Min & Max
m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
m_hPairScan = new unsigned int[m_maxHandles + 1];
m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];
// large proxies
// allocate handles buffer and put all handles on free list
m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
m_firstFreeLargeHandle = 0;
{
for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
{
m_pLargeHandles[i].SetNextFree(i + 1);
m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
}
m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
}
// debug data
m_numPairsAdded = 0;
m_numOverflows = 0;
m_bInitialized = true;
} // bt3DGridBroadphase::_initialize()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::_finalize()
{
assert(m_bInitialized);
delete [] m_hBodiesHash;
delete [] m_hCellStart;
delete [] m_hPairBuffStartCurr;
delete [] m_hAABB;
delete [] m_hPairBuff;
delete [] m_hPairScan;
delete [] m_hPairOut;
btAlignedFree(m_pLargeHandlesRawPtr);
m_bInitialized = false;
} // bt3DGridBroadphase::_finalize()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
{
if(m_numHandles <= 0)
{
BT_PROFILE("addLarge2LargePairsToCache");
addLarge2LargePairsToCache(dispatcher);
return;
}
// update constants
setParameters(&m_params);
// prepare AABB array
prepareAABB();
// calculate hash
calcHashAABB();
// sort bodies based on hash
sortHash();
// find start of each cell
findCellStart();
// findOverlappingPairs (small/small)
findOverlappingPairs();
// findOverlappingPairs (small/large)
findPairsLarge();
// add pairs to CPU cache
computePairCacheChanges();
scanOverlappingPairBuff();
squeezeOverlappingPairBuff();
addPairsToCache(dispatcher);
// find and add large/large pairs to CPU cache
addLarge2LargePairsToCache(dispatcher);
return;
} // bt3DGridBroadphase::calculateOverlappingPairs()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
{
m_numPairsAdded = 0;
m_numPairsRemoved = 0;
for(int i = 0; i < m_numHandles; i++)
{
unsigned int num = m_hPairScan[i+1] - m_hPairScan[i];
if(!num)
{
continue;
}
unsigned int* pInp = m_hPairOut + m_hPairScan[i];
unsigned int index0 = m_hAABB[i * 2].uw;
btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0];
for(unsigned int j = 0; j < num; j++)
{
unsigned int indx1_s = pInp[j];
unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG);
btSimpleBroadphaseProxy* proxy1;
if(index1 < (unsigned int)m_maxHandles)
{
proxy1 = &m_pHandles[index1];
}
else
{
index1 -= m_maxHandles;
btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles));
proxy1 = &m_pLargeHandles[index1];
}
if(indx1_s & BT_CUDA_PAIR_NEW_FLG)
{
m_pairCache->addOverlappingPair(proxy0,proxy1);
m_numPairsAdded++;
}
else
{
m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
m_numPairsRemoved++;
}
}
}
} // bt3DGridBroadphase::addPairsToCache()
//--------------------------------------------------------------------------
btBroadphaseProxy* bt3DGridBroadphase::createProxy( const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
{
btBroadphaseProxy* proxy;
bool bIsLarge = isLargeProxy(aabbMin, aabbMax);
if(bIsLarge)
{
if (m_numLargeHandles >= m_maxLargeHandles)
{
btAssert(0);
return 0; //should never happen, but don't let the game crash ;-)
}
btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
int newHandleIndex = allocLargeHandle();
proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
}
else
{
proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
}
return proxy;
} // bt3DGridBroadphase::createProxy()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
{
bool bIsLarge = isLargeProxy(proxy);
if(bIsLarge)
{
btSimpleBroadphaseProxy* proxy0 = static_cast<btSimpleBroadphaseProxy*>(proxy);
freeLargeHandle(proxy0);
m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
}
else
{
btSimpleBroadphase::destroyProxy(proxy, dispatcher);
}
return;
} // bt3DGridBroadphase::destroyProxy()
//--------------------------------------------------------------------------
bool bt3DGridBroadphase::isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax)
{
btVector3 diag = aabbMax - aabbMin;
btScalar radius = diag.length() * btScalar(0.5f);
radius *= m_cellFactorAABB; // user-defined factor
return (radius > m_maxRadius);
} // bt3DGridBroadphase::isLargeProxy()
//--------------------------------------------------------------------------
bool bt3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
{
return (proxy->getUid() >= (m_maxHandles+2));
} // bt3DGridBroadphase::isLargeProxy()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
{
int i,j;
if (m_numLargeHandles <= 0)
{
return;
}
int new_largest_index = -1;
for(i = 0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
if(!proxy0->m_clientObject)
{
continue;
}
new_largest_index = i;
for(j = i + 1; j <= m_LastLargeHandleIndex; j++)
{
btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j];
if(!proxy1->m_clientObject)
{
continue;
}
btAssert(proxy0 != proxy1);
btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
if(aabbOverlap(p0,p1))
{
if (!m_pairCache->findPair(proxy0,proxy1))
{
m_pairCache->addOverlappingPair(proxy0,proxy1);
}
}
else
{
if(m_pairCache->findPair(proxy0,proxy1))
{
m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
}
}
}
}
m_LastLargeHandleIndex = new_largest_index;
return;
} // bt3DGridBroadphase::addLarge2LargePairsToCache()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
{
btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
for (int i=0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i];
if(!proxy->m_clientObject)
{
continue;
}
rayCallback.process(proxy);
}
} // bt3DGridBroadphase::rayTest()
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
//
// overrides for CPU version
//
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
void bt3DGridBroadphase::prepareAABB()
{
BT_PROFILE("prepareAABB");
btCuda3F1U* pBB = m_hAABB;
int i;
int new_largest_index = -1;
unsigned int num_small = 0;
for(i = 0; i <= m_LastHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i];
if(!proxy0->m_clientObject)
{
continue;
}
new_largest_index = i;
pBB->fx = proxy0->m_aabbMin.getX();
pBB->fy = proxy0->m_aabbMin.getY();
pBB->fz = proxy0->m_aabbMin.getZ();
pBB->uw = i;
pBB++;
pBB->fx = proxy0->m_aabbMax.getX();
pBB->fy = proxy0->m_aabbMax.getY();
pBB->fz = proxy0->m_aabbMax.getZ();
pBB->uw = num_small;
pBB++;
num_small++;
}
m_LastHandleIndex = new_largest_index;
new_largest_index = -1;
unsigned int num_large = 0;
for(i = 0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
if(!proxy0->m_clientObject)
{
continue;
}
new_largest_index = i;
pBB->fx = proxy0->m_aabbMin.getX();
pBB->fy = proxy0->m_aabbMin.getY();
pBB->fz = proxy0->m_aabbMin.getZ();
pBB->uw = i + m_maxHandles;
pBB++;
pBB->fx = proxy0->m_aabbMax.getX();
pBB->fy = proxy0->m_aabbMax.getY();
pBB->fz = proxy0->m_aabbMax.getZ();
pBB->uw = num_large + m_maxHandles;
pBB++;
num_large++;
}
m_LastLargeHandleIndex = new_largest_index;
// paranoid checks
btAssert(num_small == m_numHandles);
btAssert(num_large == m_numLargeHandles);
return;
} // bt3DGridBroadphase::prepareAABB()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::setParameters(btCudaBroadphaseParams* hostParams)
{
s3DGridBroadphaseParams = *hostParams;
return;
} // bt3DGridBroadphase::setParameters()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::calcHashAABB()
{
BT_PROFILE("bt3DGrid_calcHashAABB");
bt3DGrid_calcHashAABB(m_hAABB, m_hBodiesHash, m_numHandles);
return;
} // bt3DGridBroadphase::calcHashAABB()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::sortHash()
{
class bt3DGridHashKey
{
public:
unsigned int hash;
unsigned int index;
void quickSort(bt3DGridHashKey* pData, int lo, int hi)
{
int i=lo, j=hi;
bt3DGridHashKey x = pData[(lo+hi)/2];
do
{
while(pData[i].hash > x.hash) i++;
while(x.hash > pData[j].hash) j--;
if(i <= j)
{
bt3DGridHashKey t = pData[i];
pData[i] = pData[j];
pData[j] = t;
i++; j--;
}
} while(i <= j);
if(lo < j) pData->quickSort(pData, lo, j);
if(i < hi) pData->quickSort(pData, i, hi);
}
};
BT_PROFILE("bt3DGrid_sortHash");
bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash;
pHash->quickSort(pHash, 0, m_numHandles - 1);
return;
} // bt3DGridBroadphase::sortHash()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::findCellStart()
{
BT_PROFILE("bt3DGrid_findCellStart");
bt3DGrid_findCellStart(m_hBodiesHash, m_hCellStart, m_numHandles, m_params.m_numCells);
return;
} // bt3DGridBroadphase::findCellStart()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::findOverlappingPairs()
{
BT_PROFILE("bt3DGrid_findOverlappingPairs");
bt3DGrid_findOverlappingPairs(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles);
return;
} // bt3DGridBroadphase::findOverlappingPairs()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::findPairsLarge()
{
BT_PROFILE("bt3DGrid_findPairsLarge");
bt3DGrid_findPairsLarge(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles, m_numLargeHandles);
return;
} // bt3DGridBroadphase::findPairsLarge()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::computePairCacheChanges()
{
BT_PROFILE("bt3DGrid_computePairCacheChanges");
bt3DGrid_computePairCacheChanges(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hAABB, m_numHandles);
return;
} // bt3DGridBroadphase::computePairCacheChanges()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::scanOverlappingPairBuff()
{
BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
m_hPairScan[0] = 0;
for(int i = 1; i <= m_numHandles; i++)
{
unsigned int delta = m_hPairScan[i];
m_hPairScan[i] = m_hPairScan[i-1] + delta;
}
return;
} // bt3DGridBroadphase::scanOverlappingPairBuff()
//--------------------------------------------------------------------------
void bt3DGridBroadphase::squeezeOverlappingPairBuff()
{
BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
bt3DGrid_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hPairOut, m_hAABB, m_numHandles);
return;
} // bt3DGridBroadphase::squeezeOverlappingPairBuff()
//--------------------------------------------------------------------------
typedef unsigned int uint;
struct uint2
{
unsigned int x, y;
};
struct int3
{
int x, y, z;
};
struct uint3
{
unsigned int x, y, z;
};
struct float4
{
float x, y, z, w;
};
#define BT3DGRID__device__ inline
#define BT3DGRIDmax(a, b) ((a) > (b) ? (a) : (b))
#define BT3DGRIDmin(a, b) ((a) < (b) ? (a) : (b))
#define BT3DGRIDparams s3DGridBroadphaseParams
#define BT3DGRID__mul24(a, b) ((a)*(b))
#define BT3DGRID__global__ inline
#define BT3DGRID__shared__ static
#define BT3DGRID__syncthreads()
static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
{
uint2 t; t.x = x; t.y = y; return t;
}
#define BT3DGRIDmake_uint2(x, y) bt3dGrid_make_uint2(x, y)
static inline int3 bt3dGrid_make_int3(int x, int y, int z)
{
int3 t; t.x = x; t.y = y; t.z = z; return t;
}
inline int3 operator+(int3 a, int3 b)
{
return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
#define BT3DGRIDmake_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
#define BT3DGRIDFETCH(a, b) a[b]
#define BT3DGRIDPREF(func) bt3DGrid_##func
#define MY_CUDA_SAFE_CALL(func) func
#define BT3DGPRDMemset memset
static uint2 s_blockIdx, s_blockDim, s_threadIdx;
#define BT3DGRIDblockIdx s_blockIdx
#define BT3DGRIDblockDim s_blockDim
#define BT3DGRIDthreadIdx s_threadIdx
#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
#define CUT_CHECK_ERROR(s)
//--------------------------------------------------------------------------
#include "bt3DGridBroadphaseFunc.h"
//--------------------------------------------------------------------------

View File

@ -0,0 +1,444 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#include "btCudaBroadphaseKernel.h"
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// calculate position in uniform grid
BT3DGRID__device__ int3 btCuda_calcGridPos(float4 p)
{
int3 gridPos;
gridPos.x = (int)floor((p.x - BT3DGRIDparams.m_worldOriginX) / BT3DGRIDparams.m_cellSizeX);
gridPos.y = (int)floor((p.y - BT3DGRIDparams.m_worldOriginY) / BT3DGRIDparams.m_cellSizeY);
gridPos.z = (int)floor((p.z - BT3DGRIDparams.m_worldOriginZ) / BT3DGRIDparams.m_cellSizeZ);
return gridPos;
}
//----------------------------------------------------------------------------------------
// calculate address in grid from position (clamping to edges)
BT3DGRID__device__ uint btCuda_calcGridHash(int3 gridPos)
{
gridPos.x = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.x, (int)BT3DGRIDparams.m_gridSizeX - 1));
gridPos.y = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.y, (int)BT3DGRIDparams.m_gridSizeY - 1));
gridPos.z = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.z, (int)BT3DGRIDparams.m_gridSizeZ - 1));
return BT3DGRID__mul24(BT3DGRID__mul24(gridPos.z, BT3DGRIDparams.m_gridSizeY), BT3DGRIDparams.m_gridSizeX) + BT3DGRID__mul24(gridPos.y, BT3DGRIDparams.m_gridSizeX) + gridPos.x;
}
//----------------------------------------------------------------------------------------
// calculate grid hash value for each body using its AABB
BT3DGRID__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
{
int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
if(index >= (int)numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index*2];
btCuda3F1U bbMax = pAABB[index*2 + 1];
float4 pos;
pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
// get address in grid
int3 gridPos = btCuda_calcGridPos(pos);
uint gridHash = btCuda_calcGridHash(gridPos);
// store grid hash and body index
pHash[index] = BT3DGRIDmake_uint2(gridHash, index);
}
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
if(index >= (int)numBodies)
{
return;
}
uint2 sortedData = pHash[index];
// Load hash data into shared memory so that we can look
// at neighboring body's hash value without loading
// two hash values per thread
BT3DGRID__shared__ uint sharedHash[257];
sharedHash[BT3DGRIDthreadIdx.x+1] = sortedData.x;
if((index > 0) && (BT3DGRIDthreadIdx.x == 0))
{
// first thread in block must load neighbor body hash
volatile uint2 prevData = pHash[index-1];
sharedHash[0] = prevData.x;
}
BT3DGRID__syncthreads();
if((index == 0) || (sortedData.x != sharedHash[BT3DGRIDthreadIdx.x]))
{
cellStart[sortedData.x] = index;
}
}
//----------------------------------------------------------------------------------------
BT3DGRID__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
{
return (min0.fx <= max1.fx)&& (min1.fx <= max0.fx) &&
(min0.fy <= max1.fy)&& (min1.fy <= max0.fy) &&
(min0.fz <= max1.fz)&& (min1.fz <= max0.fz);
}
//----------------------------------------------------------------------------------------
BT3DGRID__device__ void findPairsInCell(int3 gridPos,
uint index,
uint2* pHash,
uint* pCellStart,
btCuda3F1U* pAABB,
uint* pPairBuff,
uint2* pPairBuffStartCurr,
uint numBodies)
{
if ( (gridPos.x < 0) || (gridPos.x > (int)BT3DGRIDparams.m_gridSizeX - 1)
|| (gridPos.y < 0) || (gridPos.y > (int)BT3DGRIDparams.m_gridSizeY - 1)
|| (gridPos.z < 0) || (gridPos.z > (int)BT3DGRIDparams.m_gridSizeZ - 1))
{
return;
}
uint gridHash = btCuda_calcGridHash(gridPos);
// get start of bucket for this cell
uint bucketStart = pCellStart[gridHash];
if (bucketStart == 0xffffffff)
{
return; // cell empty
}
// iterate over bodies in this cell
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
uint handleIndex = min0.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
uint curr_max = start_curr_next.x - start - 1;
uint bucketEnd = bucketStart + BT3DGRIDparams.m_maxBodiesPerCell;
bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
{
uint2 cellData = pHash[index2];
if (cellData.x != gridHash)
{
break; // no longer in same bucket
}
uint unsorted_indx2 = cellData.y;
if (unsorted_indx2 < unsorted_indx) // check not colliding with self
{
btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2);
btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2 + 1);
if(cudaTestAABBOverlap(min0, max0, min1, max1))
{
uint handleIndex2 = min1.uw;
uint k;
for(k = 0; k < curr; k++)
{
uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
if(old_pair == handleIndex2)
{
pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
break;
}
}
if(k == curr)
{
pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
curr++;
}
}
}
}
pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr);
return;
}
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void
findOverlappingPairsD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
uint2* pPairBuffStartCurr, uint numBodies)
{
int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
if(index >= (int)numBodies)
{
return;
}
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U bbMin = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
btCuda3F1U bbMax = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
float4 pos;
pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
// get address in grid
int3 gridPos = btCuda_calcGridPos(pos);
// examine only neighbouring cells
for(int z=-1; z<=1; z++) {
for(int y=-1; y<=1; y++) {
for(int x=-1; x<=1; x++) {
findPairsInCell(gridPos + BT3DGRIDmake_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
}
}
}
}
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void
findPairsLargeD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
if(index >= (int)numBodies)
{
return;
}
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
uint handleIndex = min0.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
uint curr_max = start_curr_next.x - start - 1;
for(uint i = 0; i < numLarge; i++)
{
uint indx2 = numBodies + i;
btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, indx2*2);
btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, indx2*2 + 1);
if(cudaTestAABBOverlap(min0, max0, min1, max1))
{
uint k;
uint handleIndex2 = min1.uw;
for(k = 0; k < curr; k++)
{
uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
if(old_pair == handleIndex2)
{
pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
break;
}
}
if(k == curr)
{
pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
curr++;
}
}
}
pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr);
return;
}
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
{
int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
if(index >= (int)numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index * 2];
uint handleIndex = bbMin.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint *pInp = pPairBuff + start;
uint num_changes = 0;
for(uint k = 0; k < curr; k++, pInp++)
{
if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
{
num_changes++;
}
}
pPairScan[index+1] = num_changes;
}
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
{
int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
if(index >= (int)numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index * 2];
uint handleIndex = bbMin.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint* pInp = pPairBuff + start;
uint* pOut = pPairOut + pPairScan[index];
uint* pOut2 = pInp;
uint num = 0;
for(uint k = 0; k < curr; k++, pInp++)
{
if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
{
*pOut = *pInp;
pOut++;
}
if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
{
*pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
pOut2++;
num++;
}
}
pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, num);
} // squeezeOverlappingPairBuffD()
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// E N D O F K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
extern "C"
{
//Round a / b to nearest higher integer value
int BT3DGRIDPREF(iDivUp)(int a, int b)
{
return (a % b != 0) ? (a / b + 1) : (a / b);
}
// compute grid and thread block size for a given number of elements
void BT3DGRIDPREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
{
numThreads = BT3DGRIDmin(blockSize, n);
numBlocks = BT3DGRIDPREF(iDivUp)(n, numThreads);
}
void BT3DGRIDPREF(calcHashAABB)(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
int numThreads, numBlocks;
BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
// execute the kernel
BT3DGRIDEXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
// check if kernel invocation generated an error
CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
}
void BT3DGRIDPREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
{
int numThreads, numBlocks;
BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
MY_CUDA_SAFE_CALL(BT3DGPRDMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
BT3DGRIDEXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
}
void BT3DGRIDPREF(findOverlappingPairs(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies))
{
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
#endif
int numThreads, numBlocks;
BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
BT3DGRIDEXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
}
void BT3DGRIDPREF(findPairsLarge(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
{
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
#endif
int numThreads, numBlocks;
BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
BT3DGRIDEXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
}
void BT3DGRIDPREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies))
{
int numThreads, numBlocks;
BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
BT3DGRIDEXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
}
void BT3DGRIDPREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies))
{
int numThreads, numBlocks;
BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
BT3DGRIDEXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
} // btCuda_squeezeOverlappingPairBuff()
} // extern "C"

View File

@ -34,39 +34,8 @@ btCudaBroadphase::btCudaBroadphase(const btVector3& worldAabbMin,const btVector3
int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
int maxBodiesPerCell,
btScalar cellFactorAABB) :
btSimpleBroadphase(maxSmallProxies,
// new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache),
new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache),
m_bInitialized(false),
m_numBodies(0)
bt3DGridBroadphase(worldAabbMin, worldAabbMax, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerBody, maxBodiesPerCell,cellFactorAABB)
{
m_ownsPairCache = true;
m_params.m_gridSizeX = gridSizeX;
m_params.m_gridSizeY = gridSizeY;
m_params.m_gridSizeZ = gridSizeZ;
m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
btVector3 w_org = worldAabbMin;
m_params.m_worldOriginX = w_org.getX();
m_params.m_worldOriginY = w_org.getY();
m_params.m_worldOriginZ = w_org.getZ();
btVector3 w_size = worldAabbMax - worldAabbMin;
m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
m_maxRadius *= btScalar(0.5f);
m_params.m_numBodies = m_numBodies;
m_params.m_maxBodiesPerCell = maxBodiesPerCell;
m_numLargeHandles = 0;
m_maxLargeHandles = maxLargeProxies;
m_maxPairsPerBody = maxPairsPerBody;
m_cellFactorAABB = cellFactorAABB;
m_LastLargeHandleIndex = -1;
_initialize();
} // btCudaBroadphase::btCudaBroadphase()
@ -83,34 +52,6 @@ btCudaBroadphase::~btCudaBroadphase()
void btCudaBroadphase::_initialize()
{
assert(!m_bInitialized);
// allocate host storage
m_hBodiesHash = new unsigned int[m_maxHandles * 2];
memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
m_hCellStart = new unsigned int[m_params.m_numCells];
memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
// --------------- for now, init with m_maxPairsPerBody for each body
m_hPairBuffStartCurr[0] = 0;
m_hPairBuffStartCurr[1] = 0;
for(int i = 1; i <= m_maxHandles; i++)
{
m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
m_hPairBuffStartCurr[i * 2 + 1] = 0;
}
//----------------
unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
m_hAABB = new btCuda3F1U[numAABB * 2]; // AABB Min & Max
m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
m_hPairScan = new unsigned int[m_maxHandles + 1];
m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];
// allocate GPU data
btCuda_allocateArray((void**)&m_dBodiesHash[0], m_maxHandles * 2 * sizeof(unsigned int));
btCuda_allocateArray((void**)&m_dBodiesHash[1], m_maxHandles * 2 * sizeof(unsigned int));
@ -123,34 +64,12 @@ void btCudaBroadphase::_initialize()
btCuda_allocateArray((void**)&m_dPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
btCuda_copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
btCuda_allocateArray((void**)&m_dAABB, numAABB * sizeof(btCuda3F1U) * 2);
btCuda_allocateArray((void**)&m_dPairScan, (m_maxHandles + 1) * sizeof(unsigned int));
btCuda_allocateArray((void**)&m_dPairOut, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
btCuda_setParameters(&m_params);
// large proxies
// allocate handles buffer and put all handles on free list
m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
m_firstFreeLargeHandle = 0;
{
for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
{
m_pLargeHandles[i].SetNextFree(i + 1);
m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
}
m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
}
// debug data
m_numPairsAdded = 0;
m_numOverflows = 0;
m_bInitialized = true;
} // btCudaBroadphase::_initialize()
//--------------------------------------------------------------------------
@ -179,324 +98,104 @@ void btCudaBroadphase::_finalize()
m_bInitialized = false;
} // btCudaBroadphase::_finalize()
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
//
// overrides for CUDA version
//
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
void btCudaBroadphase::prepareAABB()
{
if(m_numHandles <= 0)
{
BT_PROFILE("addLarge2LargePairsToCache -- CPU");
addLarge2LargePairsToCache(dispatcher);
return;
}
bt3DGridBroadphase::prepareAABB();
btCuda_copyArrayToDevice(m_dAABB, m_hAABB, sizeof(btCuda3F1U) * 2 * (m_numHandles + m_numLargeHandles));
return;
} // btCudaBroadphase::prepareAABB()
//--------------------------------------------------------------------------
// update constants
btCuda_setParameters(&m_params);
void btCudaBroadphase::setParameters(btCudaBroadphaseParams* hostParams)
{
btCuda_setParameters(hostParams);
return;
} // btCudaBroadphase::setParameters()
//--------------------------------------------------------------------------
// move AABB array to GPU
{
BT_PROFILE("copy AABB");
// do it faster ?
btCuda3F1U* pBB = m_hAABB;
int i;
int new_largest_index = -1;
unsigned int num_small = 0;
for(i = 0; i <= m_LastHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pHandles[i];
if(!proxy0->m_clientObject)
{
continue;
}
new_largest_index = i;
pBB->fx = proxy0->m_aabbMin.getX();
pBB->fy = proxy0->m_aabbMin.getY();
pBB->fz = proxy0->m_aabbMin.getZ();
pBB->uw = i;
pBB++;
pBB->fx = proxy0->m_aabbMax.getX();
pBB->fy = proxy0->m_aabbMax.getY();
pBB->fz = proxy0->m_aabbMax.getZ();
pBB->uw = num_small;
pBB++;
num_small++;
}
m_LastHandleIndex = new_largest_index;
new_largest_index = -1;
unsigned int num_large = 0;
for(i = 0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
if(!proxy0->m_clientObject)
{
continue;
}
new_largest_index = i;
pBB->fx = proxy0->m_aabbMin.getX();
pBB->fy = proxy0->m_aabbMin.getY();
pBB->fz = proxy0->m_aabbMin.getZ();
pBB->uw = i + m_maxHandles;
pBB++;
pBB->fx = proxy0->m_aabbMax.getX();
pBB->fy = proxy0->m_aabbMax.getY();
pBB->fz = proxy0->m_aabbMax.getZ();
pBB->uw = num_large + m_maxHandles;
pBB++;
num_large++;
}
m_LastLargeHandleIndex = new_largest_index;
// paranoid checks
btAssert(num_small == m_numHandles);
btAssert(num_large == m_numLargeHandles);
}
{
BT_PROFILE("CopyBB to CUDA");
btCuda_copyArrayToDevice(m_dAABB, m_hAABB, sizeof(btCuda3F1U) * 2 * (m_numHandles + m_numLargeHandles));
}
// calculate hash
{
BT_PROFILE("calcHash -- CUDA");
btCuda_calcHashAABB(m_dAABB, m_dBodiesHash[0], m_numHandles);
}
void btCudaBroadphase::calcHashAABB()
{
BT_PROFILE("btCuda_calcHashAABB");
btCuda_calcHashAABB(m_dAABB, m_dBodiesHash[0], m_numHandles);
// btCuda_copyArrayFromDevice((void*)m_hBodiesHash, (void*)m_dBodiesHash[0], sizeof(unsigned int) * 2 * m_numHandles);
// sort bodies based on hash
{
BT_PROFILE("RadixSort-- CUDA");
RadixSort((KeyValuePair*)m_dBodiesHash[0], (KeyValuePair*)m_dBodiesHash[1], m_numHandles, 32);
}
// find start of each cell
{
BT_PROFILE("Find cell start -- CUDA");
btCuda_findCellStart(m_dBodiesHash[0], m_dCellStart, m_numHandles, m_params.m_numCells);
}
return;
} // btCudaBroadphase::calcHashAABB()
//--------------------------------------------------------------------------
void btCudaBroadphase::sortHash()
{
BT_PROFILE("RadixSort-- CUDA");
RadixSort((KeyValuePair*)m_dBodiesHash[0], (KeyValuePair*)m_dBodiesHash[1], m_numHandles, 32);
return;
} // btCudaBroadphase::sortHash()
//--------------------------------------------------------------------------
void btCudaBroadphase::findCellStart()
{
BT_PROFILE("btCuda_findCellStart");
btCuda_findCellStart(m_dBodiesHash[0], m_dCellStart, m_numHandles, m_params.m_numCells);
// btCuda_copyArrayFromDevice((void*)m_hBodiesHash, (void*)m_dBodiesHash[0], sizeof(unsigned int) * 2 * m_numHandles);
// btCuda_copyArrayFromDevice((void*)m_hCellStart, (void*)m_dCellStart, sizeof(unsigned int) * m_params.m_numCells);
{
BT_PROFILE("FindOverlappingPairs -- CUDA");
btCuda_findOverlappingPairs(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles);
}
{
BT_PROFILE("FindPairsLarge -- CUDA");
btCuda_findPairsLarge(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles, m_numLargeHandles);
}
{
BT_PROFILE("ComputePairCacheChanges -- CUDA");
btCuda_computePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dAABB, m_numHandles);
}
{
BT_PROFILE("scanOverlappingPairBuff -- CPU");
btCuda_copyArrayFromDevice(m_hPairScan, m_dPairScan, sizeof(unsigned int)*(m_numHandles + 1));
scanOverlappingPairBuffCPU();
btCuda_copyArrayToDevice(m_dPairScan, m_hPairScan, sizeof(unsigned int)*(m_numHandles + 1));
}
{
BT_PROFILE("SqueezeOverlappingPairBuff -- CUDA");
btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_dAABB, m_numHandles);
}
{
BT_PROFILE("SqueezeOverlappingPairBuff -- CUDA");
btCuda_copyArrayFromDevice(m_hPairOut, m_dPairOut, sizeof(unsigned int) * m_hPairScan[m_numHandles]);
}
{
BT_PROFILE("addPairsToCache -- CPU");
addPairsToCacheCPU(dispatcher);
}
{
BT_PROFILE("addLarge2LargePairsToCache -- CPU");
addLarge2LargePairsToCache(dispatcher);
}
return;
} // btCudaBroadphase::calculateOverlappingPairs()
} // btCudaBroadphase::findCellStart()
//--------------------------------------------------------------------------
void btCudaBroadphase::scanOverlappingPairBuffCPU()
void btCudaBroadphase::findOverlappingPairs()
{
m_hPairScan[0] = 0;
for(int i = 1; i <= m_numHandles; i++)
{
unsigned int delta = m_hPairScan[i];
m_hPairScan[i] = m_hPairScan[i-1] + delta;
}
} // btCudaBroadphase::scanOverlappingPairBuffCPU()
//--------------------------------------------------------------------------
void btCudaBroadphase::addPairsToCacheCPU(btDispatcher* dispatcher)
{
m_numPairsAdded = 0;
m_numPairsRemoved = 0;
for(int i = 0; i < m_numHandles; i++)
{
unsigned int num = m_hPairScan[i+1] - m_hPairScan[i];
if(!num)
{
continue;
}
unsigned int* pInp = m_hPairOut + m_hPairScan[i];
unsigned int index0 = m_hAABB[i * 2].uw;
btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0];
for(unsigned int j = 0; j < num; j++)
{
unsigned int indx1_s = pInp[j];
unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG);
btSimpleBroadphaseProxy* proxy1;
if(index1 < (unsigned int)m_maxHandles)
{
proxy1 = &m_pHandles[index1];
}
else
{
index1 -= m_maxHandles;
btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles));
proxy1 = &m_pLargeHandles[index1];
}
if(indx1_s & BT_CUDA_PAIR_NEW_FLG)
{
m_pairCache->addOverlappingPair(proxy0,proxy1);
m_numPairsAdded++;
}
else
{
m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
m_numPairsRemoved++;
}
}
}
} // btCudaBroadphase::addPairsToCacheCPU()
//--------------------------------------------------------------------------
btBroadphaseProxy* btCudaBroadphase::createProxy( const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
{
btBroadphaseProxy* proxy;
bool bIsLarge = isLargeProxy(aabbMin, aabbMax);
if(bIsLarge)
{
if (m_numLargeHandles >= m_maxLargeHandles)
{
btAssert(0);
return 0; //should never happen, but don't let the game crash ;-)
}
btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
int newHandleIndex = allocLargeHandle();
proxy = new (&m_pLargeHandles[newHandleIndex])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
}
else
{
proxy = btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
}
return proxy;
} // btCudaBroadphase::createProxy()
//--------------------------------------------------------------------------
void btCudaBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
{
bool bIsLarge = isLargeProxy(proxy);
if(bIsLarge)
{
btSimpleBroadphaseProxy* proxy0 = static_cast<btSimpleBroadphaseProxy*>(proxy);
freeLargeHandle(proxy0);
// TODO : remove pair from cache on GPU as well !!!
// UPD: they will not be used anyway, so don't waste time
m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
}
else
{
btSimpleBroadphase::destroyProxy(proxy, dispatcher);
}
BT_PROFILE("btCuda_findOverlappingPairs");
btCuda_findOverlappingPairs(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles);
return;
} // btCudaBroadphase::destroyProxy()
} // btCudaBroadphase::findOverlappingPairs()
//--------------------------------------------------------------------------
bool btCudaBroadphase::isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax)
void btCudaBroadphase::findPairsLarge()
{
btVector3 diag = aabbMax - aabbMin;
btScalar radius = diag.length() * btScalar(0.5f);
radius *= m_cellFactorAABB; // user-defined factor
return (radius > m_maxRadius);
} // btCudaBroadphase::isLargeProxy()
//--------------------------------------------------------------------------
bool btCudaBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
{
return (proxy->getUid() >= (m_maxHandles+2));
} // btCudaBroadphase::isLargeProxy()
//--------------------------------------------------------------------------
void btCudaBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
{
int i,j;
if (m_numLargeHandles <= 0)
{
return;
}
int new_largest_index = -1;
for(i = 0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[i];
if(!proxy0->m_clientObject)
{
continue;
}
new_largest_index = i;
for(j = i + 1; j <= m_LastLargeHandleIndex; j++)
{
btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[j];
if(!proxy1->m_clientObject)
{
continue;
}
btAssert(proxy0 != proxy1);
btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
if(aabbOverlap(p0,p1))
{
if (!m_pairCache->findPair(proxy0,proxy1))
{
m_pairCache->addOverlappingPair(proxy0,proxy1);
}
}
else
{
if(m_pairCache->findPair(proxy0,proxy1))
{
m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
}
}
}
}
m_LastLargeHandleIndex = new_largest_index;
BT_PROFILE("btCuda_findPairsLarge");
btCuda_findPairsLarge(m_dAABB, m_dBodiesHash[0], m_dCellStart, m_dPairBuff, m_dPairBuffStartCurr, m_numHandles, m_numLargeHandles);
return;
} // btCudaBroadphase::addLarge2LargePairsToCache()
} // btCudaBroadphase::findPairsLarge()
//--------------------------------------------------------------------------
void btCudaBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
void btCudaBroadphase::computePairCacheChanges()
{
btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
for (int i=0; i <= m_LastLargeHandleIndex; i++)
{
btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[i];
if(!proxy->m_clientObject)
{
continue;
}
rayCallback.process(proxy);
}
} // btCudaBroadphase::rayTest()
BT_PROFILE("btCuda_computePairCacheChanges");
btCuda_computePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dAABB, m_numHandles);
return;
} // btCudaBroadphase::computePairCacheChanges()
//--------------------------------------------------------------------------
void btCudaBroadphase::scanOverlappingPairBuff()
{
btCuda_copyArrayFromDevice(m_hPairScan, m_dPairScan, sizeof(unsigned int)*(m_numHandles + 1));
bt3DGridBroadphase::scanOverlappingPairBuff();
btCuda_copyArrayToDevice(m_dPairScan, m_hPairScan, sizeof(unsigned int)*(m_numHandles + 1));
return;
} // btCudaBroadphase::scanOverlappingPairBuff()
//--------------------------------------------------------------------------
void btCudaBroadphase::squeezeOverlappingPairBuff()
{
BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_dAABB, m_numHandles);
btCuda_copyArrayFromDevice(m_hPairOut, m_dPairOut, sizeof(unsigned int) * m_hPairScan[m_numHandles]);
return;
} // btCudaBroadphase::squeezeOverlappingPairBuff()
//--------------------------------------------------------------------------

View File

@ -65,9 +65,9 @@
__device__ inline btCuda3F1U tex_fetch3F1U(float4 a) { return *((btCuda3F1U*)(&a)); }
#if B_CUDA_USE_TEX
#define FETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
#define BT3DGRIDFETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
#else
#define FETCH(t, i) t[i]
#define BT3DGRIDFETCH(t, i) t[i]
#endif
texture<uint2, 1, cudaReadModeElementType> particleHashTex;
@ -80,323 +80,25 @@ __constant__ btCudaBroadphaseParams params;
//----------------------------------------------------------------------------------------
// calculate position in uniform grid
__device__ int3 btCuda_calcGridPos(float4 p)
{
int3 gridPos;
gridPos.x = floor((p.x - params.m_worldOriginX) / params.m_cellSizeX);
gridPos.y = floor((p.y - params.m_worldOriginY) / params.m_cellSizeY);
gridPos.z = floor((p.z - params.m_worldOriginZ) / params.m_cellSizeZ);
return gridPos;
}
#define BT3DGRID__device__ __device__
#define BT3DGRIDmax(a, b) max(a, b)
#define BT3DGRIDmin(a, b) min(a, b)
#define BT3DGRIDparams params
#define BT3DGRID__mul24(a, b) __mul24(a, b)
#define BT3DGRID__global__ __global__
#define BT3DGRID__shared__ __shared__
#define BT3DGRID__syncthreads() __syncthreads()
#define BT3DGRIDmake_uint2(x, y) make_uint2(x, y)
#define BT3DGRIDmake_int3(x, y, z) make_int3(x, y, z)
#define BT3DGRIDPREF(func) btCuda_##func
#define BT3DGPRDMemset cudaMemset
#define BT3DGRIDblockIdx blockIdx
#define BT3DGRIDblockDim blockDim
#define BT3DGRIDthreadIdx threadIdx
#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) kfunc<<<numb, numt>>>args
//----------------------------------------------------------------------------------------
// calculate address in grid from position (clamping to edges)
__device__ uint btCuda_calcGridHash(int3 gridPos)
{
gridPos.x = max(0, min(gridPos.x, params.m_gridSizeX - 1));
gridPos.y = max(0, min(gridPos.y, params.m_gridSizeY - 1));
gridPos.z = max(0, min(gridPos.z, params.m_gridSizeZ - 1));
return __mul24(__mul24(gridPos.z, params.m_gridSizeY), params.m_gridSizeX) + __mul24(gridPos.y, params.m_gridSizeX) + gridPos.x;
}
//----------------------------------------------------------------------------------------
// calculate grid hash value for each body using its AABB
__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
{
int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index*2];
btCuda3F1U bbMax = pAABB[index*2 + 1];
float4 pos;
pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
// get address in grid
int3 gridPos = btCuda_calcGridPos(pos);
uint gridHash = btCuda_calcGridHash(gridPos);
// store grid hash and body index
pHash[index] = make_uint2(gridHash, index);
}
//----------------------------------------------------------------------------------------
__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
uint2 sortedData = pHash[index];
// Load hash data into shared memory so that we can look
// at neighboring body's hash value without loading
// two hash values per thread
__shared__ uint sharedHash[257];
sharedHash[threadIdx.x+1] = sortedData.x;
if((index > 0) && (threadIdx.x == 0))
{
// first thread in block must load neighbor body hash
volatile uint2 prevData = pHash[index-1];
sharedHash[0] = prevData.x;
}
__syncthreads();
if((index == 0) || (sortedData.x != sharedHash[threadIdx.x]))
{
cellStart[sortedData.x] = index;
}
}
//----------------------------------------------------------------------------------------
__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
{
return (min0.fx <= max1.fx)&& (min1.fx <= max0.fx) &&
(min0.fy <= max1.fy)&& (min1.fy <= max0.fy) &&
(min0.fz <= max1.fz)&& (min1.fz <= max0.fz);
}
//----------------------------------------------------------------------------------------
__device__ void findPairsInCell(int3 gridPos,
uint index,
uint2* pHash,
uint* pCellStart,
btCuda3F1U* pAABB,
uint* pPairBuff,
uint2* pPairBuffStartCurr,
uint numBodies)
{
if ( (gridPos.x < 0) || (gridPos.x > params.m_gridSizeX - 1)
|| (gridPos.y < 0) || (gridPos.y > params.m_gridSizeY - 1)
|| (gridPos.z < 0) || (gridPos.z > params.m_gridSizeZ - 1))
{
return;
}
uint gridHash = btCuda_calcGridHash(gridPos);
// get start of bucket for this cell
uint bucketStart = pCellStart[gridHash];
if (bucketStart == 0xffffffff)
{
return; // cell empty
}
// iterate over bodies in this cell
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
uint handleIndex = min0.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
uint curr_max = start_curr_next.x - start - 1;
uint bucketEnd = bucketStart + params.m_maxBodiesPerCell;
bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
{
uint2 cellData = pHash[index2];
if (cellData.x != gridHash)
{
break; // no longer in same bucket
}
uint unsorted_indx2 = cellData.y;
if (unsorted_indx2 < unsorted_indx) // check not colliding with self
{
btCuda3F1U min1 = FETCH(pAABB, unsorted_indx2*2);
btCuda3F1U max1 = FETCH(pAABB, unsorted_indx2*2 + 1);
if(cudaTestAABBOverlap(min0, max0, min1, max1))
{
uint handleIndex2 = min1.uw;
uint k;
for(k = 0; k < curr; k++)
{
uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
if(old_pair == handleIndex2)
{
pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
break;
}
}
if(k == curr)
{
pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
curr++;
}
}
}
}
pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
return;
}
//----------------------------------------------------------------------------------------
__global__ void
findOverlappingPairsD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
uint2* pPairBuffStartCurr, uint numBodies)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U bbMin = FETCH(pAABB, unsorted_indx*2);
btCuda3F1U bbMax = FETCH(pAABB, unsorted_indx*2 + 1);
float4 pos;
pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
// get address in grid
int3 gridPos = btCuda_calcGridPos(pos);
// examine only neighbouring cells
for(int z=-1; z<=1; z++) {
for(int y=-1; y<=1; y++) {
for(int x=-1; x<=1; x++) {
findPairsInCell(gridPos + make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
}
}
}
}
//----------------------------------------------------------------------------------------
__global__ void
findPairsLargeD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
uint handleIndex = min0.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
uint curr_max = start_curr_next.x - start - 1;
for(uint i = 0; i < numLarge; i++)
{
uint indx2 = numBodies + i;
btCuda3F1U min1 = FETCH(pAABB, indx2*2);
btCuda3F1U max1 = FETCH(pAABB, indx2*2 + 1);
if(cudaTestAABBOverlap(min0, max0, min1, max1))
{
uint k;
uint handleIndex2 = min1.uw;
for(k = 0; k < curr; k++)
{
uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
if(old_pair == handleIndex2)
{
pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
break;
}
}
if(k == curr)
{
pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
curr++;
}
}
}
pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
return;
}
//----------------------------------------------------------------------------------------
__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index * 2];
uint handleIndex = bbMin.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint *pInp = pPairBuff + start;
uint num_changes = 0;
for(uint k = 0; k < curr; k++, pInp++)
{
if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
{
num_changes++;
}
}
pPairScan[index+1] = num_changes;
}
//----------------------------------------------------------------------------------------
__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index * 2];
uint handleIndex = bbMin.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
uint* pInp = pPairBuff + start;
uint* pOut = pPairOut + pPairScan[index];
uint* pOut2 = pInp;
uint num = 0;
for(uint k = 0; k < curr; k++, pInp++)
{
if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
{
*pOut = *pInp;
pOut++;
}
if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
{
*pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
pOut2++;
num++;
}
}
pPairBuffStartCurr[handleIndex] = make_uint2(start, num);
} // squeezeOverlappingPairBuffD()
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// E N D O F K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
@ -430,9 +132,7 @@ __global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffSta
btCuda_exit(EXIT_FAILURE); \
} } while (0)
extern "C"
{
//----------------------------------------------------------------------------------------
void btCuda_exit(int val)
{
@ -465,125 +165,9 @@ void btCuda_setParameters(btCudaBroadphaseParams* hostParams)
MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(btCudaBroadphaseParams)));
}
//Round a / b to nearest higher integer value
int btCuda_iDivUp(int a, int b)
{
return (a % b != 0) ? (a / b + 1) : (a / b);
}
//----------------------------------------------------------------------------------------
// compute grid and thread block size for a given number of elements
void btCuda_computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
{
numThreads = min(blockSize, n);
numBlocks = btCuda_iDivUp(n, numThreads);
}
#include "bt3DGridBroadphaseFunc.h"
void btCuda_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
int numThreads, numBlocks;
btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
// execute the kernel
calcHashAABBD<<< numBlocks, numThreads >>>(pAABB, (uint2*)hash, numBodies);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
}
//----------------------------------------------------------------------------------------
void btCuda_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
{
int numThreads, numBlocks;
btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
findCellStartD<<< numBlocks, numThreads >>>((uint2*)hash, (uint*)cellStart, numBodies);
CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
}
void btCuda_findOverlappingPairs( btCuda3F1U* pAABB, unsigned int* pHash,
unsigned int* pCellStart,
unsigned int* pPairBuff,
unsigned int* pPairBuffStartCurr,
unsigned int numBodies)
{
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
#endif
int numThreads, numBlocks;
btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
findOverlappingPairsD<<< numBlocks, numThreads >>>(
pAABB,
(uint2*)pHash,
(uint*)pCellStart,
(uint*)pPairBuff,
(uint2*)pPairBuffStartCurr,
numBodies
);
CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // btCuda_findOverlappingPairs()
void btCuda_findPairsLarge( btCuda3F1U* pAABB, unsigned int* pHash,
unsigned int* pCellStart,
unsigned int* pPairBuff,
unsigned int* pPairBuffStartCurr,
unsigned int numBodies,
unsigned int numLarge)
{
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
#endif
int numThreads, numBlocks;
btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
findPairsLargeD<<< numBlocks, numThreads >>>(
pAABB,
(uint2*)pHash,
(uint*)pCellStart,
(uint*)pPairBuff,
(uint2*)pPairBuffStartCurr,
numBodies,
numLarge
);
CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
#if B_CUDA_USE_TEX
MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // btCuda_findPairsLarge()
void btCuda_computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr,
unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies)
{
int numThreads, numBlocks;
btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
computePairCacheChangesD<<< numBlocks, numThreads >>>(
(uint*)pPairBuff,
(uint2*)pPairBuffStartCurr,
(uint*)pPairScan,
pAABB,
numBodies
);
CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
} // btCuda_computePairCacheChanges()
void btCuda_squeezeOverlappingPairBuff( unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan,
unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies)
{
int numThreads, numBlocks;
btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
squeezeOverlappingPairBuffD<<< numBlocks, numThreads >>>(
(uint*)pPairBuff,
(uint2*)pPairBuffStartCurr,
(uint*)pPairScan,
(uint*)pPairOut,
pAABB,
numBodies
);
CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
} // btCuda_squeezeOverlappingPairBuff()
} // extern "C"

View File

@ -21,14 +21,18 @@ subject to the following restrictions:
#include "btCudaBroadphaseKernel.h"
///The btCudaBroadphase uses CUDA to compute overlapping pairs using a GPU.
class btCudaBroadphase : public btSimpleBroadphase
///The bt3DGridBroadphase uses CUDA to compute overlapping pairs using a GPU.
class bt3DGridBroadphase : public btSimpleBroadphase
{
protected:
bool m_bInitialized;
unsigned int m_numBodies;
unsigned int m_numCells;
unsigned int m_maxPairsPerBody;
btScalar m_cellFactorAABB;
unsigned int m_maxBodiesPerCell;
btCudaBroadphaseParams m_params;
btScalar m_maxRadius;
// CPU data
unsigned int* m_hBodiesHash;
unsigned int* m_hCellStart;
@ -37,17 +41,6 @@ class btCudaBroadphase : public btSimpleBroadphase
unsigned int* m_hPairBuff;
unsigned int* m_hPairScan;
unsigned int* m_hPairOut;
// GPU data
unsigned int* m_dBodiesHash[2];
unsigned int* m_dCellStart;
unsigned int* m_dPairBuff;
unsigned int* m_dPairBuffStartCurr;
btCuda3F1U* m_dAABB;
unsigned int* m_dPairScan;
unsigned int* m_dPairOut;
unsigned int m_maxBodiesPerCell;
btCudaBroadphaseParams m_params;
btScalar m_maxRadius;
// large proxies
int m_numLargeHandles;
int m_maxLargeHandles;
@ -82,12 +75,56 @@ class btCudaBroadphase : public btSimpleBroadphase
}
bool isLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax);
bool isLargeProxy(btBroadphaseProxy* proxy);
// debug
unsigned int m_numPairsAdded;
unsigned int m_numPairsRemoved;
unsigned int m_numOverflows;
//
public:
bt3DGridBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax,
int gridSizeX, int gridSizeY, int gridSizeZ,
int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
int maxBodiesPerCell = 8,
btScalar cellFactorAABB = btScalar(1.0f));
virtual ~bt3DGridBroadphase();
virtual void calculateOverlappingPairs(btDispatcher* dispatcher);
virtual btBroadphaseProxy* createProxy(const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
virtual void destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
virtual void rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
protected:
void _initialize();
void _finalize();
void addPairsToCache(btDispatcher* dispatcher);
void addLarge2LargePairsToCache(btDispatcher* dispatcher);
// overrides for CPU version
virtual void setParameters(btCudaBroadphaseParams* hostParams);
virtual void prepareAABB();
virtual void calcHashAABB();
virtual void sortHash();
virtual void findCellStart();
virtual void findOverlappingPairs();
virtual void findPairsLarge();
virtual void computePairCacheChanges();
virtual void scanOverlappingPairBuff();
virtual void squeezeOverlappingPairBuff();
};
///The btCudaBroadphase uses CUDA to compute overlapping pairs using a GPU.
class btCudaBroadphase : public bt3DGridBroadphase
{
protected:
// GPU data
unsigned int* m_dBodiesHash[2];
unsigned int* m_dCellStart;
unsigned int* m_dPairBuff;
unsigned int* m_dPairBuffStartCurr;
btCuda3F1U* m_dAABB;
unsigned int* m_dPairScan;
unsigned int* m_dPairOut;
public:
btCudaBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax,
int gridSizeX, int gridSizeY, int gridSizeZ,
@ -95,18 +132,21 @@ public:
int maxBodiesPerCell = 8,
btScalar cellFactorAABB = btScalar(1.0f));
virtual ~btCudaBroadphase();
virtual void calculateOverlappingPairs(btDispatcher* dispatcher);
virtual btBroadphaseProxy* createProxy(const btVector3& aabbMin, const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
virtual void destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
virtual void rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
protected:
void _initialize();
void _finalize();
void scanOverlappingPairBuffCPU();
void addPairsToCacheCPU(btDispatcher* dispatcher);
void addLarge2LargePairsToCache(btDispatcher* dispatcher);
void allocateArray(void** devPtr, unsigned int size);
void freeArray(void* devPtr);
// overrides for CUDA version
virtual void setParameters(btCudaBroadphaseParams* hostParams);
virtual void prepareAABB();
virtual void calcHashAABB();
virtual void sortHash();
virtual void findCellStart();
virtual void findOverlappingPairs();
virtual void findPairsLarge();
virtual void computePairCacheChanges();
virtual void scanOverlappingPairBuff();
virtual void squeezeOverlappingPairBuff();
};
#endif //CUDA_BROADPHASE_H

View File

@ -54,7 +54,28 @@ struct btCuda3F1U
extern "C"
{
// CPU functions
void bt3DGrid_setParameters(btCudaBroadphaseParams* hostParams);
void bt3DGrid_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies);
void bt3DGrid_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
void bt3DGrid_findOverlappingPairs( btCuda3F1U* pAABB, unsigned int* pHash,
unsigned int* pCellStart,
unsigned int* pPairBuff,
unsigned int* pPairBuffStartCurr,
unsigned int numBodies);
void bt3DGrid_findPairsLarge( btCuda3F1U* pAABB, unsigned int* pHash,
unsigned int* pCellStart,
unsigned int* pPairBuff,
unsigned int* pPairBuffStartCurr,
unsigned int numBodies,
unsigned int numLarge);
void bt3DGrid_computePairCacheChanges( unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr,
unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies);
void bt3DGrid_squeezeOverlappingPairBuff( unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan,
unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies);
// CUDA functions
void btCuda_allocateArray(void** devPtr, unsigned int size);
void btCuda_freeArray(void* devPtr);
void btCuda_copyArrayFromDevice(void* host, const void* device, unsigned int size);

View File

@ -506,6 +506,14 @@
<Filter
Name="Source Files"
>
<File
RelativePath=".\bt3DGridBroadphase.cpp"
>
</File>
<File
RelativePath=".\bt3DGridBroadphaseFunc.h"
>
</File>
<File
RelativePath=".\btCudaBroadphase.cpp"
>

View File

@ -664,6 +664,7 @@ void ParticleSystem::initializeBullet()
// m_broadphase = new btDbvtBroadphase();
// m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3));
m_broadphase = new btCudaBroadphase(btVector3(-1, -1, -1), btVector3(1, 1, 1), 64, 64, 64, m_params.numBodies, 16, 64, 8, btScalar(1.0f/1.733f));
// m_broadphase = new bt3DGridBroadphase(btVector3(-1, -1, -1), btVector3(1, 1, 1), 64, 64, 64, m_params.numBodies, 16, 64, 8, btScalar(1.0f/1.733f));
m_constraintSolver=new btSequentialImpulseConstraintSolver();