add bitonic sort, as comparison.

fix stringify.bat for Windows (need to fix Mac/Linux version too)
This commit is contained in:
erwincoumans 2013-04-30 11:40:09 -07:00
parent c5f488fe6d
commit 92f0938af3
24 changed files with 1857 additions and 177 deletions

View File

@ -101,7 +101,9 @@
include "../test/OpenCL/BasicInitialize"
-- include "../test/OpenCL/BroadphaseCollision"
-- include "../test/OpenCL/NarrowphaseCollision"
-- include "../test/OpenCL/ParallelPrimitives"
include "../test/OpenCL/ParallelPrimitives"
include "../test/OpenCL/RadixSortBenchmark"
include "../test/OpenCL/BitonicSort"
include "../src/Bullet3Dynamics"
include "../src/Bullet3Common"

View File

@ -1,35 +1,30 @@
@echo off
rem @echo off
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sap.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapKernels.h" --stringname="sapCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sapFast.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/sat.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satKernels.h" --stringname="satKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.cl" --headerfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup.h" --stringname="solverSetupCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup2.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernels.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverUtils.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverUtils.h" --stringname="solverUtilsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveContact.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveContact.h" --stringname="solveContactCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveFriction.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveFriction.h" --stringname="solveFrictionCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h" --stringname="satKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.h" --stringname="solverSetupCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.h" --stringname="solverUtilsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.h" --stringname="solveContactCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.h" --stringname="solveFrictionCL" stringify
pause

View File

@ -30,12 +30,12 @@ static const char* sapFastCL= \
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"} btAabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
"//skip pairs between static (mass=0) objects\n"
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
@ -50,18 +50,18 @@ static const char* sapFastCL= \
"\n"
"\n"
"//computePairsKernelBatchWrite\n"
"__kernel void computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" int2 myPairs[64];\n"
" \n"
" b3AabbCL myAabb;\n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"

View File

@ -30,12 +30,12 @@ static const char* sapCL= \
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"} btAabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
@ -43,8 +43,8 @@ static const char* sapCL= \
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
@ -53,8 +53,8 @@ static const char* sapCL= \
" return overlap;\n"
"}\n"
"\n"
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
@ -64,7 +64,7 @@ static const char* sapCL= \
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
@ -89,7 +89,7 @@ static const char* sapCL= \
" }\n"
"}\n"
"\n"
"__kernel void computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
@ -117,7 +117,7 @@ static const char* sapCL= \
"\n"
"\n"
"\n"
"__kernel void computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
@ -181,16 +181,16 @@ static const char* sapCL= \
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" b3AabbCL myAabb;\n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
@ -289,7 +289,7 @@ static const char* sapCL= \
"\n"
"\n"
"\n"
"__kernel void copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
@ -300,7 +300,7 @@ static const char* sapCL= \
"}\n"
"\n"
"\n"
"__kernel void flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
@ -312,7 +312,7 @@ static const char* sapCL= \
"}\n"
"\n"
"\n"
"__kernel void scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"

View File

@ -1,6 +1,6 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* bvhTraversalKernelCL= \
"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"\n"
"#define SHAPE_CONVEX_HULL 3\n"
@ -13,7 +13,7 @@ static const char* bvhTraversalKernelCL= \
"\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"\n"
"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
@ -22,7 +22,7 @@ static const char* bvhTraversalKernelCL= \
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} b3QuantizedBvhNode;\n"
"} btQuantizedBvhNode;\n"
"\n"
"typedef struct\n"
"{\n"
@ -44,12 +44,12 @@ static const char* bvhTraversalKernelCL= \
" }\n"
" int getEscapeIndex() const\n"
" {\n"
" b3Assert(!isLeafNode());\n"
" btAssert(!isLeafNode());\n"
" return -m_escapeIndexOrTriangleIndex;\n"
" }\n"
" int getTriangleIndex() const\n"
" {\n"
" b3Assert(isLeafNode());\n"
" btAssert(isLeafNode());\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
@ -57,13 +57,13 @@ static const char* bvhTraversalKernelCL= \
" }\n"
" int getPartId() const\n"
" {\n"
" b3Assert(isLeafNode());\n"
" btAssert(isLeafNode());\n"
" // Get only the highest bits where the part index is stored\n"
" return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));\n"
" }\n"
"*/\n"
"\n"
"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
@ -71,13 +71,13 @@ static const char* bvhTraversalKernelCL= \
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"\n"
"int isLeaf(const b3QuantizedBvhNode* rootNode)\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
@ -92,9 +92,9 @@ static const char* bvhTraversalKernelCL= \
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} b3BvhSubtreeInfo;\n"
"} btBvhSubtreeInfo;\n"
"\n"
"///keep this in sync with b3Collidable.h\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
@ -102,7 +102,7 @@ static const char* bvhTraversalKernelCL= \
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} b3CollidableGpu;\n"
"} btCollidableGpu;\n"
"\n"
"typedef struct\n"
"{\n"
@ -112,7 +112,7 @@ static const char* bvhTraversalKernelCL= \
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} b3GpuChildShape;\n"
"} btGpuChildShape;\n"
"\n"
"\n"
"typedef struct\n"
@ -142,7 +142,7 @@ static const char* bvhTraversalKernelCL= \
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"} btAabbCL;\n"
"\n"
"\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
@ -196,12 +196,12 @@ static const char* bvhTraversalKernelCL= \
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global b3AabbCL* aabbs,\n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const b3BvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const b3QuantizedBvhNode* quantizedNodesRoot,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
@ -238,8 +238,8 @@ static const char* bvhTraversalKernelCL= \
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const b3BvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const b3QuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
"\n"
" unsigned short int quantizedQueryAabbMin[3];\n"
@ -249,7 +249,7 @@ static const char* bvhTraversalKernelCL= \
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" b3BvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
@ -262,7 +262,7 @@ static const char* bvhTraversalKernelCL= \
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" b3QuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"

View File

@ -67,9 +67,9 @@ static const char* primitiveContactsKernelsCL= \
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"} btAabbCL;\n"
"\n"
"///keep this in sync with b3Collidable.h\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
@ -77,7 +77,7 @@ static const char* primitiveContactsKernelsCL= \
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} b3CollidableGpu;\n"
"} btCollidableGpu;\n"
"\n"
"typedef struct\n"
"{\n"
@ -87,7 +87,7 @@ static const char* primitiveContactsKernelsCL= \
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} b3GpuChildShape;\n"
"} btGpuChildShape;\n"
"\n"
"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
"\n"
@ -129,7 +129,7 @@ static const char* primitiveContactsKernelsCL= \
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"} btGpuFace;\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
@ -290,7 +290,7 @@ static const char* primitiveContactsKernelsCL= \
"\n"
"\n"
"inline bool IsPointInPolygon(float4 p, \n"
" const b3GpuFace* face,\n"
" const btGpuFace* face,\n"
" __global const float4* baseVertex,\n"
" __global const int* convexIndices,\n"
" float4* out)\n"
@ -352,11 +352,11 @@ static const char* primitiveContactsKernelsCL= \
" int bodyIndexA, int bodyIndexB, \n"
" int collidableIndexA, int collidableIndexB, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" __global const float4* convexVertices,\n"
" __global const int* convexIndices,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
" int maxContactCapacity,\n"
@ -383,7 +383,7 @@ static const char* primitiveContactsKernelsCL= \
"\n"
" for ( int f = 0; f < numFaces; f++ )\n"
" {\n"
" b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
" btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
"\n"
" // set up a plane equation \n"
" float4 planeEqn;\n"
@ -594,11 +594,11 @@ static const char* primitiveContactsKernelsCL= \
" int bodyIndexA, int bodyIndexB, \n"
" int collidableIndexA, int collidableIndexB, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu*collidables,\n"
" __global const btCollidableGpu*collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" __global const float4* convexVertices,\n"
" __global const int* convexIndices,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
" int maxContactCapacity,\n"
@ -733,8 +733,8 @@ static const char* primitiveContactsKernelsCL= \
" int bodyIndexA, int bodyIndexB, \n"
" int collidableIndexA, int collidableIndexB, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const btGpuFace* faces,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
" int maxContactCapacity)\n"
@ -793,11 +793,11 @@ static const char* primitiveContactsKernelsCL= \
"\n"
"__kernel void primitiveContactsKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
@ -972,14 +972,14 @@ static const char* primitiveContactsKernelsCL= \
"// work-in-progress\n"
"__kernel void processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global b3AabbCL* aabbs,\n"
" __global const b3GpuChildShape* gpuChildShapes,\n"
" __global btAabbCL* aabbs,\n"
" __global const btGpuChildShape* gpuChildShapes,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
" int numCompoundPairs, int maxContactCapacity\n"
@ -1157,7 +1157,7 @@ static const char* primitiveContactsKernelsCL= \
" int bodyIndexA, int bodyIndexB,\n"
" int collidableIndexA, int collidableIndexB, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" const float4* triangleVertices,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
@ -1299,13 +1299,13 @@ static const char* primitiveContactsKernelsCL= \
"// work-in-progress\n"
"__kernel void findConcaveSphereContactsKernel( __global int4* concavePairs,\n"
" __global const BodyData* rigidBodies,\n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global b3AabbCL* aabbs,\n"
" __global btAabbCL* aabbs,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
" int numConcavePairs, int maxContactCapacity\n"
@ -1329,7 +1329,7 @@ static const char* primitiveContactsKernelsCL= \
" if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n"
" {\n"
" int f = concavePairs[i].z;\n"
" b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
" \n"
" float4 verticesA[3];\n"
" for (int i=0;i<3;i++)\n"

View File

@ -55,7 +55,7 @@ static const char* satClipKernelsCL= \
"} Contact4;\n"
"\n"
"\n"
"///keep this in sync with b3Collidable.h\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
@ -63,7 +63,7 @@ static const char* satClipKernelsCL= \
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} b3CollidableGpu;\n"
"} btCollidableGpu;\n"
"\n"
"typedef struct\n"
"{\n"
@ -73,7 +73,7 @@ static const char* satClipKernelsCL= \
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} b3GpuChildShape;\n"
"} btGpuChildShape;\n"
"\n"
"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
"\n"
@ -115,7 +115,7 @@ static const char* satClipKernelsCL= \
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"} btGpuFace;\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
@ -357,7 +357,7 @@ static const char* satClipKernelsCL= \
" float4* worldVertsB2, int capacityWorldVertsB2,\n"
" const float minDist, float maxDist,\n"
" __global const float4* vertices,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" float4* contactsOut,\n"
" int contactCapacity)\n"
@ -392,7 +392,7 @@ static const char* satClipKernelsCL= \
" if (closestFaceA<0)\n"
" return numContactsOut;\n"
"\n"
" b3GpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
" btGpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
"\n"
" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
" int numVerticesA = polyA.m_numIndices;\n"
@ -416,7 +416,7 @@ static const char* satClipKernelsCL= \
" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
"\n"
" //b3Swap(pVtxIn,pVtxOut);\n"
" //btSwap(pVtxIn,pVtxOut);\n"
" float4* tmp = pVtxOut;\n"
" pVtxOut = pVtxIn;\n"
" pVtxIn = tmp;\n"
@ -458,10 +458,10 @@ static const char* satClipKernelsCL= \
" float4* worldVertsB2, int capacityWorldVertsB2,\n"
" const float minDist, float maxDist,\n"
" const float4* verticesA,\n"
" const b3GpuFace* facesA,\n"
" const btGpuFace* facesA,\n"
" const int* indicesA,\n"
" __global const float4* verticesB,\n"
" __global const b3GpuFace* facesB,\n"
" __global const btGpuFace* facesB,\n"
" __global const int* indicesB,\n"
" float4* contactsOut,\n"
" int contactCapacity)\n"
@ -496,7 +496,7 @@ static const char* satClipKernelsCL= \
" if (closestFaceA<0)\n"
" return numContactsOut;\n"
"\n"
" b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
" btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
"\n"
" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
" int numVerticesA = polyA.m_numIndices;\n"
@ -520,7 +520,7 @@ static const char* satClipKernelsCL= \
" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
"\n"
" //b3Swap(pVtxIn,pVtxOut);\n"
" //btSwap(pVtxIn,pVtxOut);\n"
" float4* tmp = pVtxOut;\n"
" pVtxOut = pVtxIn;\n"
" pVtxIn = tmp;\n"
@ -561,7 +561,7 @@ static const char* satClipKernelsCL= \
" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
" const float minDist, float maxDist,\n"
" __global const float4* vertices,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" float4* localContactsOut,\n"
" int localContactCapacity)\n"
@ -589,7 +589,7 @@ static const char* satClipKernelsCL= \
" }\n"
"\n"
" {\n"
" const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
" const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
" const int numVertices = polyB.m_numIndices;\n"
" for(int e0=0;e0<numVertices;e0++)\n"
" {\n"
@ -617,10 +617,10 @@ static const char* satClipKernelsCL= \
" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
" const float minDist, float maxDist,\n"
" const float4* verticesA,\n"
" const b3GpuFace* facesA,\n"
" const btGpuFace* facesA,\n"
" const int* indicesA,\n"
" __global const float4* verticesB,\n"
" __global const b3GpuFace* facesB,\n"
" __global const btGpuFace* facesB,\n"
" __global const int* indicesB,\n"
" float4* localContactsOut,\n"
" int localContactCapacity)\n"
@ -648,7 +648,7 @@ static const char* satClipKernelsCL= \
" }\n"
"\n"
" {\n"
" const b3GpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
" const int numVertices = polyB.m_numIndices;\n"
" for(int e0=0;e0<numVertices;e0++)\n"
" {\n"
@ -956,11 +956,11 @@ static const char* satClipKernelsCL= \
"\n"
"__kernel void clipHullHullKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global const float4* separatingNormals,\n"
" __global const int* hasSeparatingAxis,\n"
@ -1053,13 +1053,13 @@ static const char* satClipKernelsCL= \
"\n"
"__kernel void clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global const b3GpuChildShape* gpuChildShapes,\n"
" __global const btGpuChildShape* gpuChildShapes,\n"
" __global const float4* gpuCompoundSepNormalsOut,\n"
" __global const int* gpuHasCompoundSepNormalsOut,\n"
" __global Contact4* restrict globalContactsOut,\n"
@ -1185,7 +1185,7 @@ static const char* satClipKernelsCL= \
"\n"
"__kernel void sphereSphereCollisionKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const float4* separatingNormals,\n"
" __global const int* hasSeparatingAxis,\n"
" __global Contact4* restrict globalContactsOut,\n"
@ -1252,13 +1252,13 @@ static const char* satClipKernelsCL= \
"\n"
"__kernel void clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global const b3GpuChildShape* gpuChildShapes,\n"
" __global const btGpuChildShape* gpuChildShapes,\n"
" __global const float4* separatingNormals,\n"
" __global Contact4* restrict globalContactsOut,\n"
" counter32_t nGlobalContactsOut,\n"
@ -1306,7 +1306,7 @@ static const char* satClipKernelsCL= \
" convexPolyhedronA.m_vertexOffset = 0;\n"
" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
"\n"
" b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
" \n"
" float4 verticesA[3];\n"
" for (int i=0;i<3;i++)\n"
@ -1335,7 +1335,7 @@ static const char* satClipKernelsCL= \
" \n"
" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
" \n"
" b3GpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
" int indicesA[3+3+2+2+2];\n"
" int curUsedIndices=0;\n"
" int fidx=0;\n"
@ -1496,7 +1496,7 @@ static const char* satClipKernelsCL= \
" int capacityWorldVerts,\n"
" const float minDist, float maxDist,\n"
" __global const float4* vertices,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global int4* clippingFaces, int pairIndex)\n"
"{\n"
@ -1523,7 +1523,7 @@ static const char* satClipKernelsCL= \
" }\n"
" \n"
" {\n"
" const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
" const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
" const int numVertices = polyB.m_numIndices;\n"
" for(int e0=0;e0<numVertices;e0++)\n"
" {\n"
@ -1664,11 +1664,11 @@ static const char* satClipKernelsCL= \
"\n"
"__kernel void findClippingFacesKernel( __global const int2* pairs,\n"
" __global const BodyData* rigidBodies,\n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global const float4* separatingNormals,\n"
" __global const int* hasSeparatingAxis,\n"

View File

@ -1,6 +1,6 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* satKernelsCL= \
"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"\n"
"\n"
@ -13,7 +13,7 @@ static const char* satKernelsCL= \
"\n"
"typedef unsigned int u32;\n"
"\n"
"///keep this in sync with b3Collidable.h\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
@ -21,7 +21,7 @@ static const char* satKernelsCL= \
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} b3CollidableGpu;\n"
"} btCollidableGpu;\n"
"\n"
"typedef struct\n"
"{\n"
@ -31,7 +31,7 @@ static const char* satKernelsCL= \
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} b3GpuChildShape;\n"
"} btGpuChildShape;\n"
"\n"
"\n"
"typedef struct\n"
@ -80,14 +80,14 @@ static const char* satKernelsCL= \
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"} btAabbCL;\n"
"\n"
"typedef struct\n"
"{\n"
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"} btGpuFace;\n"
"\n"
"#define make_float4 (float4)\n"
"\n"
@ -296,12 +296,12 @@ static const char* satKernelsCL= \
" \n"
" const float4* verticesA, \n"
" const float4* uniqueEdgesA, \n"
" const b3GpuFace* facesA,\n"
" const btGpuFace* facesA,\n"
" const int* indicesA,\n"
"\n"
" __global const float4* verticesB, \n"
" __global const float4* uniqueEdgesB, \n"
" __global const b3GpuFace* facesB,\n"
" __global const btGpuFace* facesB,\n"
" __global const int* indicesB,\n"
" float4* sep,\n"
" float* dmin)\n"
@ -348,11 +348,11 @@ static const char* satKernelsCL= \
" const float4 DeltaC2,\n"
" __global const float4* verticesA, \n"
" __global const float4* uniqueEdgesA, \n"
" __global const b3GpuFace* facesA,\n"
" __global const btGpuFace* facesA,\n"
" __global const int* indicesA,\n"
" const float4* verticesB,\n"
" const float4* uniqueEdgesB, \n"
" const b3GpuFace* facesB,\n"
" const btGpuFace* facesB,\n"
" const int* indicesB,\n"
" float4* sep,\n"
" float* dmin)\n"
@ -401,11 +401,11 @@ static const char* satKernelsCL= \
" const float4 DeltaC2,\n"
" const float4* verticesA, \n"
" const float4* uniqueEdgesA, \n"
" const b3GpuFace* facesA,\n"
" const btGpuFace* facesA,\n"
" const int* indicesA,\n"
" __global const float4* verticesB, \n"
" __global const float4* uniqueEdgesB, \n"
" __global const b3GpuFace* facesB,\n"
" __global const btGpuFace* facesB,\n"
" __global const int* indicesB,\n"
" float4* sep,\n"
" float* dmin)\n"
@ -507,7 +507,7 @@ static const char* satKernelsCL= \
" const float4 DeltaC2,\n"
" __global const float4* vertices, \n"
" __global const float4* uniqueEdges, \n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" float4* sep,\n"
" float* dmin)\n"
@ -566,7 +566,7 @@ static const char* satKernelsCL= \
" const float4 DeltaC2,\n"
" __global const float4* vertices, \n"
" __global const float4* uniqueEdges, \n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" float4* sep,\n"
" float* dmin)\n"
@ -643,14 +643,14 @@ static const char* satKernelsCL= \
"// work-in-progress\n"
"__kernel void processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global b3AabbCL* aabbs,\n"
" __global const b3GpuChildShape* gpuChildShapes,\n"
" __global btAabbCL* aabbs,\n"
" __global const btGpuChildShape* gpuChildShapes,\n"
" __global volatile float4* gpuCompoundSepNormalsOut,\n"
" __global volatile int* gpuHasCompoundSepNormalsOut,\n"
" int numCompoundPairs\n"
@ -760,14 +760,14 @@ static const char* satKernelsCL= \
"// work-in-progress\n"
"__kernel void findCompoundPairsKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global b3AabbCL* aabbs,\n"
" __global const b3GpuChildShape* gpuChildShapes,\n"
" __global btAabbCL* aabbs,\n"
" __global const btGpuChildShape* gpuChildShapes,\n"
" __global volatile int4* gpuCompoundPairsOut,\n"
" __global volatile int* numCompoundPairsOut,\n"
" int numPairs,\n"
@ -942,13 +942,13 @@ static const char* satKernelsCL= \
"// work-in-progress\n"
"__kernel void findSeparatingAxisKernel( __global const int2* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global b3AabbCL* aabbs,\n"
" __global btAabbCL* aabbs,\n"
" __global volatile float4* separatingNormals,\n"
" __global volatile int* hasSeparatingAxis,\n"
" int numPairs\n"
@ -1056,14 +1056,14 @@ static const char* satKernelsCL= \
"// work-in-progress\n"
"__kernel void findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n"
" __global const BodyData* rigidBodies,\n"
" __global const b3CollidableGpu* collidables,\n"
" __global const btCollidableGpu* collidables,\n"
" __global const ConvexPolyhedronCL* convexShapes, \n"
" __global const float4* vertices,\n"
" __global const float4* uniqueEdges,\n"
" __global const b3GpuFace* faces,\n"
" __global const btGpuFace* faces,\n"
" __global const int* indices,\n"
" __global const b3GpuChildShape* gpuChildShapes,\n"
" __global b3AabbCL* aabbs,\n"
" __global const btGpuChildShape* gpuChildShapes,\n"
" __global btAabbCL* aabbs,\n"
" __global float4* concaveSeparatingNormalsOut,\n"
" int numConcavePairs\n"
" )\n"
@ -1106,9 +1106,9 @@ static const char* satKernelsCL= \
" convexPolyhedronA.m_vertexOffset = 0;\n"
" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
"\n"
" b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
" float4 triMinAabb, triMaxAabb;\n"
" b3AabbCL triAabb;\n"
" btAabbCL triAabb;\n"
" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
" \n"
@ -1153,7 +1153,7 @@ static const char* satKernelsCL= \
" \n"
" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
" \n"
" b3GpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
" int indicesA[3+3+2+2+2];\n"
" int curUsedIndices=0;\n"
" int fidx=0;\n"

View File

@ -210,7 +210,7 @@ static const char* batchingKernelsNewCL= \
"\n"
" if (i!=numValidConstraints)\n"
" {\n"
" //b3Swap(cs[i],cs[numValidConstraints]);\n"
" //btSwap(cs[i],cs[numValidConstraints]);\n"
" \n"
" Contact4 tmp = cs[i];\n"
" cs[i] = cs[numValidConstraints];\n"

View File

@ -47,7 +47,7 @@ static const char* integrateKernelCL= \
" integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" float B3_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" if( nodeID < numNodes && (bodies[nodeID].m_invMass != 0.f))\n"
" {\n"
" //angular velocity\n"
@ -61,9 +61,9 @@ static const char* integrateKernelCL= \
" float4 angvel = bodies[nodeID].m_angVel;\n"
" float fAngle = native_sqrt(dot(angvel, angvel));\n"
" //limit the angular motion\n"
" if(fAngle*timeStep > B3_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = B3_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"

View File

@ -313,8 +313,8 @@ static const char* solveContactCL= \
" }\n"
"}\n"
"\n"
"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"

View File

@ -265,8 +265,8 @@ static const char* solveFrictionCL= \
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
@ -347,7 +347,7 @@ static const char* solveFrictionCL= \
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" b3PlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"

View File

@ -489,8 +489,8 @@ static const char* solverSetupCL= \
"} ConstBufferSSD;\n"
"\n"
"\n"
"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n.z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
@ -577,7 +577,7 @@ static const char* solverSetupCL= \
" center /= (float)src->m_worldNormal.w;\n"
"\n"
" float4 tangent[2];\n"
" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
" \n"
" float4 r[2];\n"
" r[0] = center - posA;\n"

View File

@ -488,8 +488,8 @@ static const char* solverUtilsCL= \
"}\n"
"\n"
"\n"
"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n.z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
@ -739,7 +739,7 @@ static const char* solverUtilsCL= \
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" b3PlaneSpace1(n,&tangent[0],&tangent[1]);\n"
" btPlaneSpace1(n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
@ -896,7 +896,7 @@ static const char* solverUtilsCL= \
" center /= (float)src->m_worldNormal.w;\n"
"\n"
" float4 tangent[2];\n"
" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
" \n"
" float4 r[2];\n"
" r[0] = center - posA;\n"

View File

@ -120,7 +120,7 @@ static const char* updateAabbsKernelCL= \
" float fy;\n"
" float fz;\n"
" int uw;\n"
"} b3AABBCL;\n"
"} btAABBCL;\n"
"\n"
"__inline\n"
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
@ -156,7 +156,7 @@ static const char* updateAabbsKernelCL= \
"}\n"
"\n"
"\n"
"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)\n"
"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" \n"
@ -171,8 +171,8 @@ static const char* updateAabbsKernelCL= \
" \n"
" if (shapeIndex>=0)\n"
" {\n"
" b3AABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
" b3AABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
" btAABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
" btAABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
" \n"
" float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;\n"
" float4 localCenter = ((float4)(maxAabb.fx + minAabb.fx,maxAabb.fy + minAabb.fy,maxAabb.fz + minAabb.fz,0.f))*0.5f;\n"

View File

@ -0,0 +1,171 @@
MSTRINGIFY(
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subbatch and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
for(uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
{
__local int2 l_key[1024U];
uint localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subarray and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
for(uint size = 2; size < localSizeLimit; size <<= 1)
{
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//Odd / even arrays of localSizeLimit elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
{
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
int2 keyA = pKey[pos + 0];
int2 keyB = pKey[pos + stride];
ComparatorPrivate(&keyA, &keyB, ddd);
pKey[pos + 0] = keyA;
pKey[pos + stride] = keyB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
);

View File

@ -0,0 +1,83 @@
#include "b3BitonicSort.h"
#include "Bullet3Common/b3Scalar.h"
//Note: logically shared with BitonicSort OpenCL code!
// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
{
if(arrayLength < 2)
return;
//Only power-of-two array lengths are supported so far
info.dir = (info.dir != 0);
cl_int ciErrNum;
size_t localWorkSize, globalWorkSize;
if(arrayLength <= info.localSizeLimit)
{
b3Assert( ( arrayLength) % info.localSizeLimit == 0);
//Launch bitonicSortLocal
ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicSortLocal1
ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
{
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
{
if(stride >= info.localSizeLimit)
{
//Launch bitonicMergeGlobal
ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 4;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicMergeLocal
ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
break;
}
}
}
}
}

View File

@ -0,0 +1,30 @@
#ifndef B3_BITONIC_SORT_H
#define B3_BITONIC_SORT_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
struct b3BitonicSortInfo
{
cl_command_queue m_cqCommandQue;
cl_kernel bitonicSortLocal;
cl_kernel bitonicSortLocal1;
cl_kernel bitonicSortMergeGlobal;
cl_kernel bitonicSortMergeLocal;
unsigned int dir;
unsigned int localSizeLimit;
b3BitonicSortInfo()
{
bitonicSortLocal=0;
bitonicSortLocal1=0;
bitonicSortMergeGlobal=0;
bitonicSortMergeLocal=0;
dir = 1;
localSizeLimit = 1024U;
}
};
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info);
#endif //B3_BITONIC_SORT_H

View File

@ -0,0 +1,192 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Common/b3Int2.h"
#include "Bullet3Common/b3Quickprof.h"
#include "b3BitonicSort.h"
#include <stdio.h>
int numSuccess=0;
int numFailed=0;
cl_context g_cxMainContext;
cl_command_queue g_cqCommandQue;
#define MSTRINGIFY(A) #A
static const char* kernelSource=
#include "BitonicSort.cl"
static bool compareFunc(const b3Int2& p, const b3Int2& q)
{
return (p.x < q.x) || ((p.x == q.x) && ((p.y < q.y)));
}
int main(int argc, char* argv[])
{
int ciErrNum = 0;
b3Clock clock;
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
int numPlatforms = b3OpenCLUtils::getNumPlatforms();
printf("Num Platforms = %d\n", numPlatforms);
for (int i=0;i<numPlatforms;i++)
{
cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
b3OpenCLPlatformInfo platformInfo;
b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
printf("--------------------------------\n");
printf("Platform info for platform nr %d:\n",i);
printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
int numDevices = b3OpenCLUtils::getNumDevices(context);
printf("Num Devices = %d\n", numDevices);
for (int j=0;j<numDevices;j++)
{
cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
b3OpenCLDeviceInfo devInfo;
b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
printf("m_deviceName = %s\n",devInfo.m_deviceName);
//b3OpenCLUtils::printDeviceInfo(dev);
g_cqCommandQue = clCreateCommandQueue(context, dev, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3BitonicSortInfo info;
info.bitonicSortLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortLocal1 = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal1",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortMergeGlobal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeGlobal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortMergeLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeLocal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.m_cqCommandQue = g_cqCommandQue;
b3OpenCLArray<b3Int2> keyValuesGPU(context,g_cqCommandQue);
b3AlignedObjectArray<b3Int2> keyValuesCPU;
b3AlignedObjectArray<b3Int2> keyValuesGold;
int numValues = 8*1024*1024;//2048;//1024;
keyValuesCPU.resize(numValues);
for (int i=0;i<numValues;i++)
{
b3Int2 v;
v.x = numValues+1-i;
v.y = i*i;
keyValuesCPU[i] = v;
}
keyValuesGPU.copyFromHost(keyValuesCPU);
keyValuesGPU.copyToHost(keyValuesGold);
keyValuesGold.quickSort(compareFunc);
unsigned int batch = 1;
unsigned int arrayLength = keyValuesGPU.size();
for (int i=0;i<10;i++)
{
keyValuesGPU.copyFromHost(keyValuesCPU);
clFinish(info.m_cqCommandQue);
unsigned long pre=clock.getTimeMilliseconds();
bitonicSortNv(keyValuesGPU.getBufferCL(), arrayLength, info);
clFinish(info.m_cqCommandQue);
unsigned long post=clock.getTimeMilliseconds();
printf("GPU sort took %d ms\n",post-pre);
}
keyValuesGPU.copyToHost(keyValuesCPU);
int success=1;
for (int i=0;i<numValues;i++)
{
if (keyValuesCPU[i].x != keyValuesGold[i].x)
success = 0;
if (keyValuesCPU[i].y != keyValuesGold[i].y)
success = 0;
}
if (success)
{
printf("Correct\n");
numSuccess++;
} else
{
printf("Sort Failed\n");
numFailed++;
}
}
clReleaseContext(context);
}
///Easier method to initialize OpenCL using createContextFromType for a GPU
deviceType = CL_DEVICE_TYPE_GPU;
void* glCtx=0;
void* glDC = 0;
printf("Initialize OpenCL using b3OpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
g_cxMainContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (g_cxMainContext)
{
int numDev = b3OpenCLUtils::getNumDevices(g_cxMainContext);
for (int i=0;i<numDev;i++)
{
cl_device_id device;
device = b3OpenCLUtils::getDevice(g_cxMainContext,i);
b3OpenCLDeviceInfo clInfo;
b3OpenCLUtils::getDeviceInfo(device,&clInfo);
b3OpenCLUtils::printDeviceInfo(device);
// create a command-queue
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//normally you would create and execute kernels using this command queue
clReleaseCommandQueue(g_cqCommandQue);
}
clReleaseContext(g_cxMainContext);
}
else {
printf("No OpenCL capable GPU found!");
}
printf("numSuccess=%d\n",numSuccess);
printf("numFailed=%d\n",numFailed);
printf("press <Enter>\n");
getchar();
return 0;
}

View File

@ -0,0 +1,36 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_BitonicSort_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {"../../../src"}
files {
"main.cpp",
"b3BitonicSort.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3Common/b3Quickprof.cpp",
"../../../src/Bullet3Common/b3Quickprof.h",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
}
end
end
createProject("Apple")
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")

View File

@ -0,0 +1,378 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
#include "Bullet3Common/b3CommandLineArgs.h"
#include "Bullet3Common/b3MinMax.h"
int g_nPassed = 0;
int g_nFailed = 0;
bool g_testFailed = 0;
#define TEST_INIT g_testFailed = 0;
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
b3OpenCLDeviceInfo info;
g_device= b3OpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3OpenCLUtils::printDeviceInfo(g_device);
b3OpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
void exitCL()
{
clReleaseCommandQueue(g_queue);
clReleaseContext(g_context);
}
inline void fillIntTest()
{
TEST_INIT;
b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
int maxSize=1024*256;
b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
intBuffer.resize(maxSize);
#define NUM_TESTS 7
int dx = maxSize/NUM_TESTS;
for (int iter=0;iter<NUM_TESTS;iter++)
{
int size = b3Min( 11+dx*iter, maxSize );
int value = 2;
int offset=0;
fillCL->execute(intBuffer,value,size,offset);
b3AlignedObjectArray<int> hostBuf2;
hostBuf2.resize(size);
fillCL->executeHost(hostBuf2,value,size,offset);
b3AlignedObjectArray<int> hostBuf;
intBuffer.copyToHost(hostBuf);
for(int i=0; i<size; i++)
{
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
}
}
delete fillCL;
TEST_REPORT( "fillIntTest" );
}
__inline
void seedRandom(int seed)
{
srand( seed );
}
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
float r = (rand()%10000)/10000.f;
T range = maxV - minV;
return (T)(minV + r*range);
}
struct b3SortDataCompare
{
inline bool operator()(const b3SortData& first, const b3SortData& second) const
{
return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value);
}
};
void boundSearchTest( )
{
TEST_INIT;
int maxSize = 1024*256;
int bucketSize = 256;
b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<unsigned int> upperHost;
b3AlignedObjectArray<unsigned int> lowerHost;
b3AlignedObjectArray<unsigned int> upperHostCompare;
b3AlignedObjectArray<unsigned int> lowerHostCompare;
b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize );
upperHost.resize(bucketSize);
lowerHost.resize(bucketSize);
upperHostCompare.resize(bucketSize);
lowerHostCompare.resize(bucketSize);
srcHost.resize(size);
for(int i=0; i<size; i++)
{
b3SortData v;
// v.m_key = i<2? 0 : 5;
v.m_key = getRandom(0,bucketSize);
v.m_value = i;
srcHost.at(i) = v;
}
srcHost.quickSort(b3SortDataCompare());
srcCL.copyFromHost(srcHost);
{
for(int i=0; i<bucketSize; i++)
{
lowerHost[i] = -1;
lowerHostCompare[i] = -1;
upperHost[i] = -1;
upperHostCompare[i] = -1;
}
upperCL.copyFromHost(upperHost);
lowerCL.copyFromHost(lowerHost);
}
search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
lowerCL.copyToHost(lowerHost);
upperCL.copyToHost(upperHost);
for(int i=0; i<bucketSize; i++)
{
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
}
/*
for(int i=1; i<bucketSize; i++)
{
int lhi_1 = lowerHost[i-1];
int lhi = lowerHost[i];
for(int j=lhi_1; j<lhi; j++)
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key < i );
}
}
for(int i=0; i<bucketSize; i++)
{
int jMin = (i==0)?0:upperHost[i-1];
for(int j=jMin; j<upperHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key <= i );
}
}
*/
for(int i=0; i<bucketSize; i++)
{
int lhi = lowerHost[i];
int uhi = upperHost[i];
for(int j=lhi; j<uhi; j++)
{
if ( srcHost[j].m_key != i )
{
printf("error %d != %d\n",srcHost[j].m_key,i);
}
TEST_ASSERT( srcHost[j].m_key == i );
}
}
}
delete search;
TEST_REPORT( "boundSearchTest" );
}
void prefixScanTest()
{
TEST_INIT;
int maxSize = 1024*256;
b3AlignedObjectArray<unsigned int> buf0Host;
b3AlignedObjectArray<unsigned int> buf1Host;
b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize );
buf0Host.resize(size);
buf1Host.resize(size);
for(int i=0; i<size; i++)
buf0Host[i] = 1;
buf2CL.copyFromHost( buf0Host);
unsigned int sumHost, sumGPU;
scan->executeHost(buf0Host, buf1Host, size, &sumHost );
scan->execute( buf2CL, buf3CL, size, &sumGPU );
buf3CL.copyToHost(buf0Host);
TEST_ASSERT( sumHost == sumGPU );
for(int i=0; i<size; i++)
TEST_ASSERT( buf1Host[i] == buf0Host[i] );
}
delete scan;
TEST_REPORT( "scanTest" );
}
bool radixSortTest()
{
TEST_INIT;
int maxSize = 1024*256;
b3AlignedObjectArray<b3SortData> buf0Host;
buf0Host.resize(maxSize);
b3AlignedObjectArray<b3SortData> buf1Host;
buf1Host.resize(maxSize );
b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = b3Min( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );//not necessary
buf0Host.resize(size);
for(int i=0; i<size; i++)
{
b3SortData v;
v.m_key = getRandom(0,0xff);
v.m_value = i;
buf0Host[i] = v;
}
buf2CL.copyFromHost( buf0Host);
sort->executeHost( buf0Host);
sort->execute(buf2CL);
buf2CL.copyToHost(buf1Host);
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
}
}
delete sort;
TEST_REPORT( "radixSort" );
return g_testFailed;
}
int main(int argc, char** argv)
{
int preferredDeviceIndex = -1;
int preferredPlatformIndex = -1;
b3CommandLineArgs args(argc, argv);
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
initCL(preferredDeviceIndex,preferredPlatformIndex);
fillIntTest();
boundSearchTest();
prefixScanTest();
radixSortTest();
exitCL();
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
printf("End, press <enter>\n");
getchar();
}

View File

@ -0,0 +1,41 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_OpenCL_Primitives_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {".","../../../src"}
files {
"main.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3Common/b3AlignedObjectArray.h",
}
end
end
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@ -0,0 +1,712 @@
/******************************************************************************
* Copyright 2010 Duane Merrill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may ob3ain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*
*
* AUTHORS' REQUEST:
*
* If you use|reference|benchmark this code, please cite our Technical
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
*
* @TechReport{ Merrill:Sorting:2010,
* author = "Duane Merrill and Andrew Grimshaw",
* title = "Revisiting Sorting for GPGPU Stream Architectures",
* year = "2010",
* institution = "University of Virginia, Department of Computer Science",
* address = "Charlottesville, VA, USA",
* number = "CS2010-03"
* }
*
* For more information, see our Google Code project site:
* http://code.google.com/p/back40computing/
*
* Thanks!
******************************************************************************/
/******************************************************************************
* Simple test driver program for *large-problem* radix sorting.
*
* Useful for demonstrating how to integrate radix sorting into
* your application
******************************************************************************/
/******************************************************************************
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
******************************************************************************/
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <algorithm>
#include <string>
//#include <iostream>
#include <sstream>
/**********************
*
*/
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3Common/b3Quickprof.h"
cl_context g_cxMainContext;
cl_device_id g_device;
cl_command_queue g_cqCommandQueue;
/***********************
*
*/
bool g_verbose;
///Preferred OpenCL device/platform. When < 0 then no preference is used.
///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
///Preferred device/platform take priority over this platform-vendor match
int gPreferredDeviceId = -1;
int gPreferredPlatformId = -1;
/******************************************************************************
* Routines
******************************************************************************/
/**
* Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
* number of iterations, displaying runtime information.
*
* @param[in] num_elements
* Size in elements of the vector to sort
* @param[in] h_keys
* Vector of keys to sort
* @param[in] iterations
* Number of times to invoke the GPU sorting primitive
* @param[in] cfg
* Config
*/
template <typename K>
void TimedSort(
unsigned int num_elements,
K *h_keys,
unsigned int iterations)
{
printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
int max_elements = num_elements;
b3AlignedObjectArray<unsigned int> hostData;
hostData.resize(num_elements);
for (int i=0;i<num_elements;i++)
{
hostData[i] = h_keys[i];
}
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
gpuData.copyFromHost(hostData);
//sorter.executeHost(gpuData);
sorter.execute(gpuData);
b3AlignedObjectArray<unsigned int> hostDataSorted;
gpuData.copyToHost(hostDataSorted);
clFinish(g_cqCommandQueue);
{
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
// Create sorting enactor
// Perform the timed number of sorting iterations
double elapsed = 0;
float duration = 0;
b3Clock watch;
//warm-start
gpuData.copyFromHost(hostData);
clFinish(g_cqCommandQueue);
sorter.execute(gpuData);
watch.reset();
for (int i = 0; i < iterations; i++)
{
// Move a fresh copy of the problem into device storage
gpuData.copyFromHost(hostData);
clFinish(g_cqCommandQueue);
// Start GPU timing record
double startMs = watch.getTimeMicroseconds()/1e3;
// Call the sorting API routine
sorter.execute(gpuData);
clFinish(g_cqCommandQueue);
double stopMs = watch.getTimeMicroseconds()/1e3;
duration = stopMs - startMs;
// End GPU timing record
elapsed += (double) duration;
printf("duration = %f\n", duration);
}
// Display timing information
double avg_runtime = elapsed / iterations;
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
gpuData.copyToHost(hostData);
for (int i=0;i<num_elements;i++)
{
h_keys[i] = hostData[i];
}
}
}
/**
* Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
* number of iterations, displaying runtime information.
*
* @param[in] num_elements
* Size in elements of the vector to sort
* @param[in] h_keys
* Vector of keys to sort
* @param[in,out] h_values
* Vector of values to sort
* @param[in] iterations
* Number of times to invoke the GPU sorting primitive
* @param[in] cfg
* Config
*/
template <typename K, typename V>
void TimedSort(
unsigned int num_elements,
K *h_keys,
V *h_values,
unsigned int iterations)
{
printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
int max_elements = num_elements;
b3AlignedObjectArray<b3SortData> hostData;
hostData.resize(num_elements);
for (int i=0;i<num_elements;i++)
{
hostData[i].m_key = h_keys[i];
hostData[i].m_value = h_values[i];
}
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
gpuData.copyFromHost(hostData);
//sorter.executeHost(gpuData);
sorter.execute(gpuData);
b3AlignedObjectArray<b3SortData> hostDataSorted;
gpuData.copyToHost(hostDataSorted);
#if 0
for (int i=0;i<num_elements;i++)
{
printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
}
#endif
clFinish(g_cqCommandQueue);
{
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
// Create sorting enactor
// Perform the timed number of sorting iterations
double elapsed = 0;
float duration = 0;
b3Clock watch;
//warm-start
gpuData.copyFromHost(hostData);
sorter.execute(gpuData);
clFinish(g_cqCommandQueue);
watch.reset();
for (int i = 0; i < iterations; i++)
{
// Move a fresh copy of the problem into device storage
gpuData.copyFromHost(hostData);
clFinish(g_cqCommandQueue);
// Start GPU timing record
double startMs = watch.getTimeMicroseconds()/1e3;
// Call the sorting API routine
sorter.execute(gpuData);
clFinish(g_cqCommandQueue);
double stopMs = watch.getTimeMicroseconds()/1e3;
duration = stopMs - startMs;
// End GPU timing record
elapsed += (double) duration;
printf("duration = %f\n", duration);
}
// Display timing information
double avg_runtime = elapsed / iterations;
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
gpuData.copyToHost(hostData);
for (int i=0;i<num_elements;i++)
{
h_keys[i] = hostData[i].m_key;
h_values[i] = hostData[i].m_value;
}
}
}
/**
* Generates random 32-bit keys.
*
* We always take the second-order byte from rand() because the higher-order
* bits returned by rand() are commonly considered more uniformly distributed
* than the lower-order bits.
*
* We can decrease the entropy level of keys by adopting the technique
* of Thearling and Smith in which keys are computed from the bitwise AND of
* multiple random samples:
*
* entropy_reduction | Effectively-unique bits per key
* -----------------------------------------------------
* -1 | 0
* 0 | 32
* 1 | 25.95
* 2 | 17.41
* 3 | 10.78
* 4 | 6.42
* ... | ...
*
*/
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
unsigned char key_bits[NUM_UCHARS];
do {
for (int j = 0; j < NUM_UCHARS; j++) {
unsigned char quarterword = 0xff;
for (int i = 0; i <= entropy_reduction; i++) {
quarterword &= (rand() >> 7);
}
key_bits[j] = quarterword;
}
if (lower_key_bits < sizeof(K) * 8) {
unsigned long long base = 0;
memcpy(&base, key_bits, sizeof(K));
base &= (1 << lower_key_bits) - 1;
memcpy(key_bits, &base, sizeof(K));
}
memcpy(&key, key_bits, sizeof(K));
} while (key != key); // avoids NaNs when generating random floating point numbers
}
/******************************************************************************
* Templated routines for printing keys/values to the console
******************************************************************************/
template<typename T>
void PrintValue(T val) {
printf("%d", val);
}
template<>
void PrintValue<float>(float val) {
printf("%f", val);
}
template<>
void PrintValue<double>(double val) {
printf("%f", val);
}
template<>
void PrintValue<unsigned char>(unsigned char val) {
printf("%u", val);
}
template<>
void PrintValue<unsigned short>(unsigned short val) {
printf("%u", val);
}
template<>
void PrintValue<unsigned int>(unsigned int val) {
printf("%u", val);
}
template<>
void PrintValue<long>(long val) {
printf("%ld", val);
}
template<>
void PrintValue<unsigned long>(unsigned long val) {
printf("%lu", val);
}
template<>
void PrintValue<long long>(long long val) {
printf("%lld", val);
}
template<>
void PrintValue<unsigned long long>(unsigned long long val) {
printf("%llu", val);
}
/**
* Compares the equivalence of two arrays
*/
template <typename T, typename SizeT>
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
{
printf("\n");
for (SizeT i = 0; i < len; i++) {
if (computed[i] != reference[i]) {
printf("INCORRECT: [%lu]: ", (unsigned long) i);
PrintValue<T>(computed[i]);
printf(" != ");
PrintValue<T>(reference[i]);
if (verbose) {
printf("\nresult[...");
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
PrintValue<T>(computed[j]);
printf(", ");
}
printf("...]");
printf("\nreference[...");
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
PrintValue<T>(reference[j]);
printf(", ");
}
printf("...]");
}
return 1;
}
}
printf("CORRECT\n");
return 0;
}
/**
* Creates an example sorting problem whose keys is a vector of the specified
* number of K elements, values of V elements, and then dispatches the problem
* to the GPU for the given number of iterations, displaying runtime information.
*
* @param[in] iterations
* Number of times to invoke the GPU sorting primitive
* @param[in] num_elements
* Size in elements of the vector to sort
* @param[in] cfg
* Config
*/
template<typename K, typename V>
void TestSort(
unsigned int iterations,
int num_elements,
bool keys_only)
{
// Allocate the sorting problem on the host and fill the keys with random bytes
K *h_keys = NULL;
K *h_reference_keys = NULL;
V *h_values = NULL;
h_keys = (K*) malloc(num_elements * sizeof(K));
h_reference_keys = (K*) malloc(num_elements * sizeof(K));
if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
// Use random bits
for (unsigned int i = 0; i < num_elements; ++i) {
RandomBits<K>(h_keys[i], 0);
//h_keys[i] = num_elements-i;
//h_keys[i] = 0xffffffffu-i;
if (!keys_only)
h_values[i] = h_keys[i];//0xffffffffu-i;
h_reference_keys[i] = h_keys[i];
}
// Run the timing test
if (keys_only) {
TimedSort<K>(num_elements, h_keys, iterations);
} else {
TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
}
// cudaThreadSynchronize();
// Display sorted key data
if (g_verbose) {
printf("\n\nKeys:\n");
for (int i = 0; i < num_elements; i++) {
PrintValue<K>(h_keys[i]);
printf(", ");
}
printf("\n\n");
}
// Verify solution
std::sort(h_reference_keys, h_reference_keys + num_elements);
CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
printf("\n");
fflush(stdout);
// Free our allocated host memory
if (h_keys != NULL) free(h_keys);
if (h_values != NULL) free(h_values);
}
/**
* Displays the commandline usage for this tool
*/
void Usage()
{
printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
printf("\n");
printf("\t--v\tDisplays sorted results to the console.\n");
printf("\n");
printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
printf("\n");
printf("\t--n\tThe number of elements to comprise the sample problem\n");
printf("\t\t\tDefault = 512\n");
printf("\n");
printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
printf("\n");
}
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
class b3CommandLineArgs
{
protected:
std::map<std::string, std::string> pairs;
public:
// Constructor
b3CommandLineArgs(int argc, char **argv)
{
using namespace std;
for (int i = 1; i < argc; i++)
{
string arg = argv[i];
if ((arg[0] != '-') || (arg[1] != '-')) {
continue;
}
string::size_type pos;
string key, val;
if ((pos = arg.find( '=')) == string::npos) {
key = string(arg, 2, arg.length() - 2);
val = "";
} else {
key = string(arg, 2, pos - 2);
val = string(arg, pos + 1, arg.length() - 1);
}
pairs[key] = val;
}
}
bool CheckCmdLineFlag(const char* arg_name)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
return true;
}
return false;
}
template <typename T>
void GetCmdLineArgument(const char *arg_name, T &val);
int ParsedArgc()
{
return pairs.size();
}
};
template <typename T>
void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
istringstream strstream(itr->second);
strstream >> val;
}
}
template <>
void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
string s = itr->second;
val = (char*) malloc(sizeof(char) * (s.length() + 1));
strcpy(val, s.c_str());
} else {
val = NULL;
}
}
/******************************************************************************
* Main
******************************************************************************/
extern bool gDebugSkipLoadingBinary;
int main( int argc, char** argv)
{
//gDebugSkipLoadingBinary = true;
cl_int ciErrNum;
b3CommandLineArgs args(argc,argv);
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
cl_platform_id platformId;
// g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
//g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
if (!numDev)
{
printf("error: no OpenCL devices\n");
exit(0);
}
int result;
int devId = 0;
g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
b3OpenCLUtils_printDeviceInfo(g_device);
// create a command-queue
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//srand(time(NULL));
srand(0); // presently deterministic
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
unsigned int iterations = 10;
bool keys_only = true;
//
// Check command line arguments
//
if (args.CheckCmdLineFlag("help"))
{
Usage();
return 0;
}
args.GetCmdLineArgument("i", iterations);
args.GetCmdLineArgument("n", num_elements);
keys_only = !args.CheckCmdLineFlag("key-values");
g_verbose = args.CheckCmdLineFlag("v");
TestSort<unsigned int, unsigned int>(
iterations,
num_elements,
keys_only);
}

View File

@ -0,0 +1,40 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_OpenCL_RadixSortBenchmark_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {"..","../../../src"}
-- links {
-- ("OpenCL_lib_parallel_primitives_host_" .. vendor)
-- }
files {
"main.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3Common/b3AlignedObjectArray.h",
"../../../src/Bullet3Common/b3Quickprof.cpp",
"../../../src/Bullet3Common/b3Quickprof.h",
}
end
end
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")