compute best axis on host and OpenCL for 1-axis SAP, based on best variance

This commit is contained in:
erwin coumans 2013-07-12 20:46:43 -07:00
parent 74aa499ab6
commit 061f7173b8
11 changed files with 600 additions and 12 deletions

View File

@ -37,7 +37,7 @@ public:
class b3gWindowInterface* m_window;
class GwenUserInterface* m_gui;
ConstructionInfo()
:useOpenCL(true),
preferredOpenCLPlatformIndex(-1),

View File

@ -5,6 +5,8 @@ rem @echo off
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h" --stringname="prefixScanKernelsFloat4CL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify

View File

@ -2,6 +2,8 @@
#include "b3GpuSapBroadphase.h"
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
@ -21,6 +23,9 @@ m_largeAabbsGPU(ctx,q),
m_overlappingPairs(ctx,q),
m_gpuSmallSortData(ctx,q),
m_gpuSmallSortedAabbs(ctx,q),
m_sum(ctx,q),
m_sum2(ctx,q),
m_dst(ctx,q),
m_currentBuffer(-1)
{
const char* sapSrc = sapCL;
@ -33,7 +38,7 @@ m_currentBuffer(-1)
cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
b3Assert(errNum==CL_SUCCESS);
m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue);
//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
@ -42,6 +47,10 @@ m_currentBuffer(-1)
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
#if 0
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
@ -68,11 +77,14 @@ m_currentBuffer(-1)
b3GpuSapBroadphase::~b3GpuSapBroadphase()
{
delete m_sorter;
delete m_prefixScanFloat4;
clReleaseKernel(m_scatterKernel);
clReleaseKernel(m_flipFloatKernel);
clReleaseKernel(m_copyAabbsKernel);
clReleaseKernel(m_sapKernel);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_prepareSumVarianceKernel);
}
@ -155,22 +167,26 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
}
void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
//test
//if (m_currentBuffer>=0)
// calculateOverlappingPairsHostIncremental3Sap();
int axis=0;
b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
//m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
int numSmallAabbs = m_smallAabbsCPU.size();
{
int numSmallAabbs = m_smallAabbsCPU.size();
for (int j=0;j<numSmallAabbs;j++)
{
//sync aabb
@ -180,6 +196,30 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
}
}
int axis=0;
{
b3Vector3 s(0,0,0),s2(0,0,0);
int numRigidBodies = numSmallAabbs;
for(int i=0;i<numRigidBodies;i++)
{
b3Vector3 maxAabb(m_smallAabbsCPU[i].m_max[0],m_smallAabbsCPU[i].m_max[1],m_smallAabbsCPU[i].m_max[2]);
b3Vector3 minAabb(m_smallAabbsCPU[i].m_min[0],m_smallAabbsCPU[i].m_min[1],m_smallAabbsCPU[i].m_min[2]);
b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f;
s += centerAabb;
s2 += centerAabb*centerAabb;
}
b3Vector3 v = s2 - (s*s) / (float)numRigidBodies;
if(v[1] > v[0])
axis = 1;
if(v[2] > v[axis])
axis = 2;
}
{
int numLargeAabbs = m_largeAabbsCPU.size();
for (int j=0;j<numLargeAabbs;j++)
@ -268,9 +308,9 @@ void b3GpuSapBroadphase::reset()
void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
{
int axis = 0;//todo on GPU for now hardcode
B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs");
int axis = 0;
{
@ -312,11 +352,47 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
launcher.setConst( numSmallAabbs );
int num = numSmallAabbs;
launcher.launch1D( num);
clFinish(m_queue);
}
}
}
{
B3_PROFILE("compute best variance axis");
int numSmallAabbs = m_smallAabbsGPU.size();
if (m_dst.size()!=(numSmallAabbs+1))
{
m_dst.resize(numSmallAabbs+1);
m_sum.resize(numSmallAabbs+1);
m_sum2.resize(numSmallAabbs+1);
m_sum.at(numSmallAabbs)=b3Vector3(0,0,0); //slow?
m_sum2.at(numSmallAabbs)=b3Vector3(0,0,0); //slow?
}
b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel );
launcher.setBuffer(m_smallAabbsGPU.getBufferCL());
launcher.setBuffer(m_sum.getBufferCL());
launcher.setBuffer(m_sum2.getBufferCL());
launcher.setConst( numSmallAabbs+1 );
int num = numSmallAabbs+1;
launcher.launch1D( num);
b3Vector3 s;
b3Vector3 s2;
m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s);
m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2);
b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs;
if(v[1] > v[0])
axis = 1;
if(v[2] > v[axis])
axis = 2;
}
if (syncOnHost)
{
B3_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
@ -360,7 +436,7 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
B3_PROFILE("GPU SAP");
int numSmallAabbs = m_smallAabbsGPU.size();
m_gpuSmallSortData.resize(numSmallAabbs);

View File

@ -21,6 +21,7 @@ class b3GpuSapBroadphase
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
class b3RadixSort32CL* m_sorter;
@ -33,6 +34,10 @@ class b3GpuSapBroadphase
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<b3SapAabb> m_smallAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_smallAabbsCPU;
@ -45,6 +50,7 @@ class b3GpuSapBroadphase
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
virtual ~b3GpuSapBroadphase();

View File

@ -318,3 +318,16 @@ __kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global
sortedAabbs[i] = aabbs[sortData[i].y];
}
__kernel void prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs)
{
int i = get_global_id(0);
if (i>numAabbs)
return;
float4 s;
s = (aabbs[i].m_max+aabbs[i].m_min)*0.5f;
sum[i]=s;
sum2[i]=s*s;
}

View File

@ -321,4 +321,17 @@ static const char* sapCL= \
" sortedAabbs[i] = aabbs[sortData[i].y];\n"
"}\n"
"\n"
"\n"
"\n"
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>numAabbs)\n"
" return;\n"
" float4 s;\n"
" s = (aabbs[i].m_max+aabbs[i].m_min)*0.5f;\n"
" sum[i]=s;\n"
" sum2[i]=s*s; \n"
"}\n"
"\n"
;

View File

@ -0,0 +1,126 @@
#include "b3PrefixScanFloat4CL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/PrefixScanKernelsFloat4CL.h"
b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsFloat4CL;
cl_int pErrNum;
char* additionalMacros=0;
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_localScanKernel );
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_blockSumKernel );
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_propagationKernel );
}
b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
{
delete m_workBuffer;
clReleaseKernel(m_localScanKernel);
clReleaseKernel(m_blockSumKernel);
clReleaseKernel(m_propagationKernel);
}
template<class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
n = n | (n>>i);
return n+1;
}
void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
dst.resize(src.size());
m_workBuffer->resize(src.size());
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2( numBlocks );
b3OpenCLArray<b3Vector3>* srcNative = &src;
b3OpenCLArray<b3Vector3>* dstNative = &dst;
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_localScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
}
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_blockSumKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
}
if( numBlocks > 1 )
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_propagationKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
}
if( sum )
{
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum,1,n-1,true);
}
}
void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
b3Vector3 s(0,0,0);
//if( data->m_option == EXCLUSIVE )
{
for(int i=0; i<n; i++)
{
dst[i] = s;
s += src[i];
}
}
/*else
{
for(int i=0; i<n; i++)
{
s += hSrc[i];
hDst[i] = s;
}
}
*/
if( sum )
{
*sum = dst[n-1];
}
}

View File

@ -0,0 +1,38 @@
#ifndef B3_PREFIX_SCAN_CL_H
#define B3_PREFIX_SCAN_CL_H
#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"
class b3PrefixScanFloat4CL
{
enum
{
BLOCK_SIZE = 128
};
// Option m_option;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
cl_kernel m_propagationKernel;
b3OpenCLArray<b3Vector3>* m_workBuffer;
public:
b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
virtual ~b3PrefixScanFloat4CL();
void execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum = 0);
void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
};
#endif //B3_PREFIX_SCAN_CL_H

View File

@ -0,0 +1,154 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
#define WG_SIZE 128
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)
{
float4 blocksum;
int offset = 1;
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
float4 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)
{
__local float4 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
if( lIdx == 0 )
sumBuffer[GET_GROUP_IDX] = sum;
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int lIdx = GET_LOCAL_IDX;
float4 iBlockSum = blockSum[myIdx];
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global float4* dst, uint4 cb)
{
__local float4 ldsData[2048];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
if( gIdx == 0 )
{
dst[cb.m_numBlocks] = sum;
}
}

View File

@ -0,0 +1,158 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* prefixScanKernelsFloat4CL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" float4 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" float4 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" return blocksum;\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
" __local float4 ldsData[WG_SIZE*2];\n"
"\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" if( lIdx == 0 ) \n"
" sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
"\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" float4 iBlockSum = blockSum[myIdx];\n"
"\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
" __local float4 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
"\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
"\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
"\n"
;

View File

@ -228,6 +228,8 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
} else
{
m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
//m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs);
numPairs = m_data->m_broadphaseSap->getNumOverlap();
}
}