mirror of
https://github.com/bulletphysics/bullet3
synced 2024-12-13 13:20:07 +00:00
add bitonic sort, as comparison.
fix stringify.bat for Windows (need to fix Mac/Linux version too)
This commit is contained in:
parent
c5f488fe6d
commit
92f0938af3
@ -101,7 +101,9 @@
|
||||
include "../test/OpenCL/BasicInitialize"
|
||||
-- include "../test/OpenCL/BroadphaseCollision"
|
||||
-- include "../test/OpenCL/NarrowphaseCollision"
|
||||
-- include "../test/OpenCL/ParallelPrimitives"
|
||||
include "../test/OpenCL/ParallelPrimitives"
|
||||
include "../test/OpenCL/RadixSortBenchmark"
|
||||
include "../test/OpenCL/BitonicSort"
|
||||
|
||||
include "../src/Bullet3Dynamics"
|
||||
include "../src/Bullet3Common"
|
||||
|
@ -1,35 +1,30 @@
|
||||
|
||||
@echo off
|
||||
rem @echo off
|
||||
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" --headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sap.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapKernels.h" --stringname="sapCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sapFast.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/sat.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satKernels.h" --stringname="satKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.cl" --headerfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify
|
||||
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup.h" --stringname="solverSetupCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup2.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernels.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverUtils.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverUtils.h" --stringname="solverUtilsCL" stringify
|
||||
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveContact.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveContact.h" --stringname="solveContactCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveFriction.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveFriction.h" --stringname="solveFrictionCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h" --stringname="satKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify
|
||||
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.h" --stringname="solverSetupCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.h" --stringname="solverUtilsCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.h" --stringname="solveContactCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.h" --stringname="solveFrictionCL" stringify
|
||||
|
||||
|
||||
pause
|
@ -30,12 +30,12 @@ static const char* sapFastCL= \
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
"//skip pairs between static (mass=0) objects\n"
|
||||
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
|
||||
@ -50,18 +50,18 @@ static const char* sapFastCL= \
|
||||
"\n"
|
||||
"\n"
|
||||
"//computePairsKernelBatchWrite\n"
|
||||
"__kernel void computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" int2 myPairs[64];\n"
|
||||
" \n"
|
||||
" b3AabbCL myAabb;\n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
|
@ -30,12 +30,12 @@ static const char* sapCL= \
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
@ -43,8 +43,8 @@ static const char* sapCL= \
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
@ -53,8 +53,8 @@ static const char* sapCL= \
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
@ -64,7 +64,7 @@ static const char* sapCL= \
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
|
||||
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numUnsortedAabbs)\n"
|
||||
@ -89,7 +89,7 @@ static const char* sapCL= \
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
@ -117,7 +117,7 @@ static const char* sapCL= \
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
@ -181,16 +181,16 @@ static const char* sapCL= \
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" b3AabbCL myAabb;\n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
@ -289,7 +289,7 @@ static const char* sapCL= \
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
|
||||
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
@ -300,7 +300,7 @@ static const char* sapCL= \
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
|
||||
"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
@ -312,7 +312,7 @@ static const char* sapCL= \
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
|
||||
"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
|
@ -1,6 +1,6 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* bvhTraversalKernelCL= \
|
||||
"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
|
||||
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
|
||||
"//written by Erwin Coumans\n"
|
||||
"\n"
|
||||
"#define SHAPE_CONVEX_HULL 3\n"
|
||||
@ -13,7 +13,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
"\n"
|
||||
"#define MAX_NUM_PARTS_IN_BITS 10\n"
|
||||
"\n"
|
||||
"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
|
||||
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
|
||||
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
@ -22,7 +22,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
" unsigned short int m_quantizedAabbMax[3];\n"
|
||||
" //4 bytes\n"
|
||||
" int m_escapeIndexOrTriangleIndex;\n"
|
||||
"} b3QuantizedBvhNode;\n"
|
||||
"} btQuantizedBvhNode;\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
@ -44,12 +44,12 @@ static const char* bvhTraversalKernelCL= \
|
||||
" }\n"
|
||||
" int getEscapeIndex() const\n"
|
||||
" {\n"
|
||||
" b3Assert(!isLeafNode());\n"
|
||||
" btAssert(!isLeafNode());\n"
|
||||
" return -m_escapeIndexOrTriangleIndex;\n"
|
||||
" }\n"
|
||||
" int getTriangleIndex() const\n"
|
||||
" {\n"
|
||||
" b3Assert(isLeafNode());\n"
|
||||
" btAssert(isLeafNode());\n"
|
||||
" unsigned int x=0;\n"
|
||||
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
|
||||
" // Get only the lower bits where the triangle index is stored\n"
|
||||
@ -57,13 +57,13 @@ static const char* bvhTraversalKernelCL= \
|
||||
" }\n"
|
||||
" int getPartId() const\n"
|
||||
" {\n"
|
||||
" b3Assert(isLeafNode());\n"
|
||||
" btAssert(isLeafNode());\n"
|
||||
" // Get only the highest bits where the part index is stored\n"
|
||||
" return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));\n"
|
||||
" }\n"
|
||||
"*/\n"
|
||||
"\n"
|
||||
"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
|
||||
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
|
||||
"{\n"
|
||||
" unsigned int x=0;\n"
|
||||
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
|
||||
@ -71,13 +71,13 @@ static const char* bvhTraversalKernelCL= \
|
||||
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"int isLeaf(const b3QuantizedBvhNode* rootNode)\n"
|
||||
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
|
||||
"{\n"
|
||||
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
|
||||
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
|
||||
"}\n"
|
||||
" \n"
|
||||
"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
|
||||
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
|
||||
"{\n"
|
||||
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
|
||||
"}\n"
|
||||
@ -92,9 +92,9 @@ static const char* bvhTraversalKernelCL= \
|
||||
" //4 bytes\n"
|
||||
" int m_subtreeSize;\n"
|
||||
" int m_padding[3];\n"
|
||||
"} b3BvhSubtreeInfo;\n"
|
||||
"} btBvhSubtreeInfo;\n"
|
||||
"\n"
|
||||
"///keep this in sync with b3Collidable.h\n"
|
||||
"///keep this in sync with btCollidable.h\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" int m_numChildShapes;\n"
|
||||
@ -102,7 +102,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
" int m_shapeType;\n"
|
||||
" int m_shapeIndex;\n"
|
||||
" \n"
|
||||
"} b3CollidableGpu;\n"
|
||||
"} btCollidableGpu;\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
@ -112,7 +112,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
" int m_unused0;\n"
|
||||
" int m_unused1;\n"
|
||||
" int m_unused2;\n"
|
||||
"} b3GpuChildShape;\n"
|
||||
"} btGpuChildShape;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
@ -142,7 +142,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"int testQuantizedAabbAgainstQuantizedAabb(\n"
|
||||
@ -196,12 +196,12 @@ static const char* bvhTraversalKernelCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void bvhTraversalKernel( __global const int2* pairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global int4* concavePairsOut,\n"
|
||||
" __global volatile int* numConcavePairsOut,\n"
|
||||
" __global const b3BvhSubtreeInfo* subtreeHeadersRoot,\n"
|
||||
" __global const b3QuantizedBvhNode* quantizedNodesRoot,\n"
|
||||
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
|
||||
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
|
||||
" __global const b3BvhInfo* bvhInfos,\n"
|
||||
" int numPairs,\n"
|
||||
" int maxNumConcavePairsCapacity)\n"
|
||||
@ -238,8 +238,8 @@ static const char* bvhTraversalKernelCL= \
|
||||
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
|
||||
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
|
||||
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
|
||||
" __global const b3BvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
|
||||
" __global const b3QuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
|
||||
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
|
||||
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
|
||||
" \n"
|
||||
"\n"
|
||||
" unsigned short int quantizedQueryAabbMin[3];\n"
|
||||
@ -249,7 +249,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
" \n"
|
||||
" for (int i=0;i<numSubtreeHeaders;i++)\n"
|
||||
" {\n"
|
||||
" b3BvhSubtreeInfo subtree = subtreeHeaders[i];\n"
|
||||
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
|
||||
" \n"
|
||||
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
|
||||
" if (overlap != 0)\n"
|
||||
@ -262,7 +262,7 @@ static const char* bvhTraversalKernelCL= \
|
||||
" int aabbOverlap;\n"
|
||||
" while (curIndex < endNodeIndex)\n"
|
||||
" {\n"
|
||||
" b3QuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
|
||||
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
|
||||
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
|
||||
" isLeafNode = isLeaf(&rootNode);\n"
|
||||
" if (aabbOverlap)\n"
|
||||
|
@ -67,9 +67,9 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"///keep this in sync with b3Collidable.h\n"
|
||||
"///keep this in sync with btCollidable.h\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" int m_numChildShapes;\n"
|
||||
@ -77,7 +77,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" int m_shapeType;\n"
|
||||
" int m_shapeIndex;\n"
|
||||
" \n"
|
||||
"} b3CollidableGpu;\n"
|
||||
"} btCollidableGpu;\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
@ -87,7 +87,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" int m_unused0;\n"
|
||||
" int m_unused1;\n"
|
||||
" int m_unused2;\n"
|
||||
"} b3GpuChildShape;\n"
|
||||
"} btGpuChildShape;\n"
|
||||
"\n"
|
||||
"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
|
||||
"\n"
|
||||
@ -129,7 +129,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" float4 m_plane;\n"
|
||||
" int m_indexOffset;\n"
|
||||
" int m_numIndices;\n"
|
||||
"} b3GpuFace;\n"
|
||||
"} btGpuFace;\n"
|
||||
"\n"
|
||||
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
|
||||
"\n"
|
||||
@ -290,7 +290,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
"\n"
|
||||
"\n"
|
||||
"inline bool IsPointInPolygon(float4 p, \n"
|
||||
" const b3GpuFace* face,\n"
|
||||
" const btGpuFace* face,\n"
|
||||
" __global const float4* baseVertex,\n"
|
||||
" __global const int* convexIndices,\n"
|
||||
" float4* out)\n"
|
||||
@ -352,11 +352,11 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" int bodyIndexA, int bodyIndexB, \n"
|
||||
" int collidableIndexA, int collidableIndexB, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes,\n"
|
||||
" __global const float4* convexVertices,\n"
|
||||
" __global const int* convexIndices,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
" int maxContactCapacity,\n"
|
||||
@ -383,7 +383,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
"\n"
|
||||
" for ( int f = 0; f < numFaces; f++ )\n"
|
||||
" {\n"
|
||||
" b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
|
||||
" btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
|
||||
"\n"
|
||||
" // set up a plane equation \n"
|
||||
" float4 planeEqn;\n"
|
||||
@ -594,11 +594,11 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" int bodyIndexA, int bodyIndexB, \n"
|
||||
" int collidableIndexA, int collidableIndexB, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu*collidables,\n"
|
||||
" __global const btCollidableGpu*collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes,\n"
|
||||
" __global const float4* convexVertices,\n"
|
||||
" __global const int* convexIndices,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
" int maxContactCapacity,\n"
|
||||
@ -733,8 +733,8 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" int bodyIndexA, int bodyIndexB, \n"
|
||||
" int collidableIndexA, int collidableIndexB, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
" int maxContactCapacity)\n"
|
||||
@ -793,11 +793,11 @@ static const char* primitiveContactsKernelsCL= \
|
||||
"\n"
|
||||
"__kernel void primitiveContactsKernel( __global const int2* pairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
@ -972,14 +972,14 @@ static const char* primitiveContactsKernelsCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global const b3GpuChildShape* gpuChildShapes,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global const btGpuChildShape* gpuChildShapes,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
" int numCompoundPairs, int maxContactCapacity\n"
|
||||
@ -1157,7 +1157,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" int bodyIndexA, int bodyIndexB,\n"
|
||||
" int collidableIndexA, int collidableIndexB, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" const float4* triangleVertices,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
@ -1299,13 +1299,13 @@ static const char* primitiveContactsKernelsCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void findConcaveSphereContactsKernel( __global int4* concavePairs,\n"
|
||||
" __global const BodyData* rigidBodies,\n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
" int numConcavePairs, int maxContactCapacity\n"
|
||||
@ -1329,7 +1329,7 @@ static const char* primitiveContactsKernelsCL= \
|
||||
" if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n"
|
||||
" {\n"
|
||||
" int f = concavePairs[i].z;\n"
|
||||
" b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
|
||||
" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
|
||||
" \n"
|
||||
" float4 verticesA[3];\n"
|
||||
" for (int i=0;i<3;i++)\n"
|
||||
|
@ -55,7 +55,7 @@ static const char* satClipKernelsCL= \
|
||||
"} Contact4;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"///keep this in sync with b3Collidable.h\n"
|
||||
"///keep this in sync with btCollidable.h\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" int m_numChildShapes;\n"
|
||||
@ -63,7 +63,7 @@ static const char* satClipKernelsCL= \
|
||||
" int m_shapeType;\n"
|
||||
" int m_shapeIndex;\n"
|
||||
" \n"
|
||||
"} b3CollidableGpu;\n"
|
||||
"} btCollidableGpu;\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
@ -73,7 +73,7 @@ static const char* satClipKernelsCL= \
|
||||
" int m_unused0;\n"
|
||||
" int m_unused1;\n"
|
||||
" int m_unused2;\n"
|
||||
"} b3GpuChildShape;\n"
|
||||
"} btGpuChildShape;\n"
|
||||
"\n"
|
||||
"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
|
||||
"\n"
|
||||
@ -115,7 +115,7 @@ static const char* satClipKernelsCL= \
|
||||
" float4 m_plane;\n"
|
||||
" int m_indexOffset;\n"
|
||||
" int m_numIndices;\n"
|
||||
"} b3GpuFace;\n"
|
||||
"} btGpuFace;\n"
|
||||
"\n"
|
||||
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
|
||||
"\n"
|
||||
@ -357,7 +357,7 @@ static const char* satClipKernelsCL= \
|
||||
" float4* worldVertsB2, int capacityWorldVertsB2,\n"
|
||||
" const float minDist, float maxDist,\n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" float4* contactsOut,\n"
|
||||
" int contactCapacity)\n"
|
||||
@ -392,7 +392,7 @@ static const char* satClipKernelsCL= \
|
||||
" if (closestFaceA<0)\n"
|
||||
" return numContactsOut;\n"
|
||||
"\n"
|
||||
" b3GpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
|
||||
" btGpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
|
||||
"\n"
|
||||
" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
|
||||
" int numVerticesA = polyA.m_numIndices;\n"
|
||||
@ -416,7 +416,7 @@ static const char* satClipKernelsCL= \
|
||||
" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
|
||||
" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
|
||||
"\n"
|
||||
" //b3Swap(pVtxIn,pVtxOut);\n"
|
||||
" //btSwap(pVtxIn,pVtxOut);\n"
|
||||
" float4* tmp = pVtxOut;\n"
|
||||
" pVtxOut = pVtxIn;\n"
|
||||
" pVtxIn = tmp;\n"
|
||||
@ -458,10 +458,10 @@ static const char* satClipKernelsCL= \
|
||||
" float4* worldVertsB2, int capacityWorldVertsB2,\n"
|
||||
" const float minDist, float maxDist,\n"
|
||||
" const float4* verticesA,\n"
|
||||
" const b3GpuFace* facesA,\n"
|
||||
" const btGpuFace* facesA,\n"
|
||||
" const int* indicesA,\n"
|
||||
" __global const float4* verticesB,\n"
|
||||
" __global const b3GpuFace* facesB,\n"
|
||||
" __global const btGpuFace* facesB,\n"
|
||||
" __global const int* indicesB,\n"
|
||||
" float4* contactsOut,\n"
|
||||
" int contactCapacity)\n"
|
||||
@ -496,7 +496,7 @@ static const char* satClipKernelsCL= \
|
||||
" if (closestFaceA<0)\n"
|
||||
" return numContactsOut;\n"
|
||||
"\n"
|
||||
" b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
|
||||
" btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
|
||||
"\n"
|
||||
" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
|
||||
" int numVerticesA = polyA.m_numIndices;\n"
|
||||
@ -520,7 +520,7 @@ static const char* satClipKernelsCL= \
|
||||
" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
|
||||
" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
|
||||
"\n"
|
||||
" //b3Swap(pVtxIn,pVtxOut);\n"
|
||||
" //btSwap(pVtxIn,pVtxOut);\n"
|
||||
" float4* tmp = pVtxOut;\n"
|
||||
" pVtxOut = pVtxIn;\n"
|
||||
" pVtxIn = tmp;\n"
|
||||
@ -561,7 +561,7 @@ static const char* satClipKernelsCL= \
|
||||
" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
|
||||
" const float minDist, float maxDist,\n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" float4* localContactsOut,\n"
|
||||
" int localContactCapacity)\n"
|
||||
@ -589,7 +589,7 @@ static const char* satClipKernelsCL= \
|
||||
" }\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
|
||||
" const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
|
||||
" const int numVertices = polyB.m_numIndices;\n"
|
||||
" for(int e0=0;e0<numVertices;e0++)\n"
|
||||
" {\n"
|
||||
@ -617,10 +617,10 @@ static const char* satClipKernelsCL= \
|
||||
" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
|
||||
" const float minDist, float maxDist,\n"
|
||||
" const float4* verticesA,\n"
|
||||
" const b3GpuFace* facesA,\n"
|
||||
" const btGpuFace* facesA,\n"
|
||||
" const int* indicesA,\n"
|
||||
" __global const float4* verticesB,\n"
|
||||
" __global const b3GpuFace* facesB,\n"
|
||||
" __global const btGpuFace* facesB,\n"
|
||||
" __global const int* indicesB,\n"
|
||||
" float4* localContactsOut,\n"
|
||||
" int localContactCapacity)\n"
|
||||
@ -648,7 +648,7 @@ static const char* satClipKernelsCL= \
|
||||
" }\n"
|
||||
"\n"
|
||||
" {\n"
|
||||
" const b3GpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
|
||||
" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
|
||||
" const int numVertices = polyB.m_numIndices;\n"
|
||||
" for(int e0=0;e0<numVertices;e0++)\n"
|
||||
" {\n"
|
||||
@ -956,11 +956,11 @@ static const char* satClipKernelsCL= \
|
||||
"\n"
|
||||
"__kernel void clipHullHullKernel( __global const int2* pairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global const float4* separatingNormals,\n"
|
||||
" __global const int* hasSeparatingAxis,\n"
|
||||
@ -1053,13 +1053,13 @@ static const char* satClipKernelsCL= \
|
||||
"\n"
|
||||
"__kernel void clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global const b3GpuChildShape* gpuChildShapes,\n"
|
||||
" __global const btGpuChildShape* gpuChildShapes,\n"
|
||||
" __global const float4* gpuCompoundSepNormalsOut,\n"
|
||||
" __global const int* gpuHasCompoundSepNormalsOut,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
@ -1185,7 +1185,7 @@ static const char* satClipKernelsCL= \
|
||||
"\n"
|
||||
"__kernel void sphereSphereCollisionKernel( __global const int2* pairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const float4* separatingNormals,\n"
|
||||
" __global const int* hasSeparatingAxis,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
@ -1252,13 +1252,13 @@ static const char* satClipKernelsCL= \
|
||||
"\n"
|
||||
"__kernel void clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global const b3GpuChildShape* gpuChildShapes,\n"
|
||||
" __global const btGpuChildShape* gpuChildShapes,\n"
|
||||
" __global const float4* separatingNormals,\n"
|
||||
" __global Contact4* restrict globalContactsOut,\n"
|
||||
" counter32_t nGlobalContactsOut,\n"
|
||||
@ -1306,7 +1306,7 @@ static const char* satClipKernelsCL= \
|
||||
" convexPolyhedronA.m_vertexOffset = 0;\n"
|
||||
" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
|
||||
"\n"
|
||||
" b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
|
||||
" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
|
||||
" \n"
|
||||
" float4 verticesA[3];\n"
|
||||
" for (int i=0;i<3;i++)\n"
|
||||
@ -1335,7 +1335,7 @@ static const char* satClipKernelsCL= \
|
||||
" \n"
|
||||
" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
|
||||
" \n"
|
||||
" b3GpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
|
||||
" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
|
||||
" int indicesA[3+3+2+2+2];\n"
|
||||
" int curUsedIndices=0;\n"
|
||||
" int fidx=0;\n"
|
||||
@ -1496,7 +1496,7 @@ static const char* satClipKernelsCL= \
|
||||
" int capacityWorldVerts,\n"
|
||||
" const float minDist, float maxDist,\n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global int4* clippingFaces, int pairIndex)\n"
|
||||
"{\n"
|
||||
@ -1523,7 +1523,7 @@ static const char* satClipKernelsCL= \
|
||||
" }\n"
|
||||
" \n"
|
||||
" {\n"
|
||||
" const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
|
||||
" const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
|
||||
" const int numVertices = polyB.m_numIndices;\n"
|
||||
" for(int e0=0;e0<numVertices;e0++)\n"
|
||||
" {\n"
|
||||
@ -1664,11 +1664,11 @@ static const char* satClipKernelsCL= \
|
||||
"\n"
|
||||
"__kernel void findClippingFacesKernel( __global const int2* pairs,\n"
|
||||
" __global const BodyData* rigidBodies,\n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes,\n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global const float4* separatingNormals,\n"
|
||||
" __global const int* hasSeparatingAxis,\n"
|
||||
|
@ -1,6 +1,6 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* satKernelsCL= \
|
||||
"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
|
||||
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
|
||||
"//written by Erwin Coumans\n"
|
||||
"\n"
|
||||
"\n"
|
||||
@ -13,7 +13,7 @@ static const char* satKernelsCL= \
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"\n"
|
||||
"///keep this in sync with b3Collidable.h\n"
|
||||
"///keep this in sync with btCollidable.h\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" int m_numChildShapes;\n"
|
||||
@ -21,7 +21,7 @@ static const char* satKernelsCL= \
|
||||
" int m_shapeType;\n"
|
||||
" int m_shapeIndex;\n"
|
||||
" \n"
|
||||
"} b3CollidableGpu;\n"
|
||||
"} btCollidableGpu;\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
@ -31,7 +31,7 @@ static const char* satKernelsCL= \
|
||||
" int m_unused0;\n"
|
||||
" int m_unused1;\n"
|
||||
" int m_unused2;\n"
|
||||
"} b3GpuChildShape;\n"
|
||||
"} btGpuChildShape;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
@ -80,14 +80,14 @@ static const char* satKernelsCL= \
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" float4 m_plane;\n"
|
||||
" int m_indexOffset;\n"
|
||||
" int m_numIndices;\n"
|
||||
"} b3GpuFace;\n"
|
||||
"} btGpuFace;\n"
|
||||
"\n"
|
||||
"#define make_float4 (float4)\n"
|
||||
"\n"
|
||||
@ -296,12 +296,12 @@ static const char* satKernelsCL= \
|
||||
" \n"
|
||||
" const float4* verticesA, \n"
|
||||
" const float4* uniqueEdgesA, \n"
|
||||
" const b3GpuFace* facesA,\n"
|
||||
" const btGpuFace* facesA,\n"
|
||||
" const int* indicesA,\n"
|
||||
"\n"
|
||||
" __global const float4* verticesB, \n"
|
||||
" __global const float4* uniqueEdgesB, \n"
|
||||
" __global const b3GpuFace* facesB,\n"
|
||||
" __global const btGpuFace* facesB,\n"
|
||||
" __global const int* indicesB,\n"
|
||||
" float4* sep,\n"
|
||||
" float* dmin)\n"
|
||||
@ -348,11 +348,11 @@ static const char* satKernelsCL= \
|
||||
" const float4 DeltaC2,\n"
|
||||
" __global const float4* verticesA, \n"
|
||||
" __global const float4* uniqueEdgesA, \n"
|
||||
" __global const b3GpuFace* facesA,\n"
|
||||
" __global const btGpuFace* facesA,\n"
|
||||
" __global const int* indicesA,\n"
|
||||
" const float4* verticesB,\n"
|
||||
" const float4* uniqueEdgesB, \n"
|
||||
" const b3GpuFace* facesB,\n"
|
||||
" const btGpuFace* facesB,\n"
|
||||
" const int* indicesB,\n"
|
||||
" float4* sep,\n"
|
||||
" float* dmin)\n"
|
||||
@ -401,11 +401,11 @@ static const char* satKernelsCL= \
|
||||
" const float4 DeltaC2,\n"
|
||||
" const float4* verticesA, \n"
|
||||
" const float4* uniqueEdgesA, \n"
|
||||
" const b3GpuFace* facesA,\n"
|
||||
" const btGpuFace* facesA,\n"
|
||||
" const int* indicesA,\n"
|
||||
" __global const float4* verticesB, \n"
|
||||
" __global const float4* uniqueEdgesB, \n"
|
||||
" __global const b3GpuFace* facesB,\n"
|
||||
" __global const btGpuFace* facesB,\n"
|
||||
" __global const int* indicesB,\n"
|
||||
" float4* sep,\n"
|
||||
" float* dmin)\n"
|
||||
@ -507,7 +507,7 @@ static const char* satKernelsCL= \
|
||||
" const float4 DeltaC2,\n"
|
||||
" __global const float4* vertices, \n"
|
||||
" __global const float4* uniqueEdges, \n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" float4* sep,\n"
|
||||
" float* dmin)\n"
|
||||
@ -566,7 +566,7 @@ static const char* satKernelsCL= \
|
||||
" const float4 DeltaC2,\n"
|
||||
" __global const float4* vertices, \n"
|
||||
" __global const float4* uniqueEdges, \n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" float4* sep,\n"
|
||||
" float* dmin)\n"
|
||||
@ -643,14 +643,14 @@ static const char* satKernelsCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global const b3GpuChildShape* gpuChildShapes,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global const btGpuChildShape* gpuChildShapes,\n"
|
||||
" __global volatile float4* gpuCompoundSepNormalsOut,\n"
|
||||
" __global volatile int* gpuHasCompoundSepNormalsOut,\n"
|
||||
" int numCompoundPairs\n"
|
||||
@ -760,14 +760,14 @@ static const char* satKernelsCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void findCompoundPairsKernel( __global const int2* pairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global const b3GpuChildShape* gpuChildShapes,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global const btGpuChildShape* gpuChildShapes,\n"
|
||||
" __global volatile int4* gpuCompoundPairsOut,\n"
|
||||
" __global volatile int* numCompoundPairsOut,\n"
|
||||
" int numPairs,\n"
|
||||
@ -942,13 +942,13 @@ static const char* satKernelsCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void findSeparatingAxisKernel( __global const int2* pairs, \n"
|
||||
" __global const BodyData* rigidBodies, \n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global volatile float4* separatingNormals,\n"
|
||||
" __global volatile int* hasSeparatingAxis,\n"
|
||||
" int numPairs\n"
|
||||
@ -1056,14 +1056,14 @@ static const char* satKernelsCL= \
|
||||
"// work-in-progress\n"
|
||||
"__kernel void findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n"
|
||||
" __global const BodyData* rigidBodies,\n"
|
||||
" __global const b3CollidableGpu* collidables,\n"
|
||||
" __global const btCollidableGpu* collidables,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes, \n"
|
||||
" __global const float4* vertices,\n"
|
||||
" __global const float4* uniqueEdges,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const btGpuFace* faces,\n"
|
||||
" __global const int* indices,\n"
|
||||
" __global const b3GpuChildShape* gpuChildShapes,\n"
|
||||
" __global b3AabbCL* aabbs,\n"
|
||||
" __global const btGpuChildShape* gpuChildShapes,\n"
|
||||
" __global btAabbCL* aabbs,\n"
|
||||
" __global float4* concaveSeparatingNormalsOut,\n"
|
||||
" int numConcavePairs\n"
|
||||
" )\n"
|
||||
@ -1106,9 +1106,9 @@ static const char* satKernelsCL= \
|
||||
" convexPolyhedronA.m_vertexOffset = 0;\n"
|
||||
" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
|
||||
"\n"
|
||||
" b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
|
||||
" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
|
||||
" float4 triMinAabb, triMaxAabb;\n"
|
||||
" b3AabbCL triAabb;\n"
|
||||
" btAabbCL triAabb;\n"
|
||||
" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
|
||||
" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
|
||||
" \n"
|
||||
@ -1153,7 +1153,7 @@ static const char* satKernelsCL= \
|
||||
" \n"
|
||||
" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
|
||||
" \n"
|
||||
" b3GpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
|
||||
" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
|
||||
" int indicesA[3+3+2+2+2];\n"
|
||||
" int curUsedIndices=0;\n"
|
||||
" int fidx=0;\n"
|
||||
|
@ -210,7 +210,7 @@ static const char* batchingKernelsNewCL= \
|
||||
"\n"
|
||||
" if (i!=numValidConstraints)\n"
|
||||
" {\n"
|
||||
" //b3Swap(cs[i],cs[numValidConstraints]);\n"
|
||||
" //btSwap(cs[i],cs[numValidConstraints]);\n"
|
||||
" \n"
|
||||
" Contact4 tmp = cs[i];\n"
|
||||
" cs[i] = cs[numValidConstraints];\n"
|
||||
|
@ -47,7 +47,7 @@ static const char* integrateKernelCL= \
|
||||
" integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
|
||||
"{\n"
|
||||
" int nodeID = get_global_id(0);\n"
|
||||
" float B3_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
|
||||
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
|
||||
" if( nodeID < numNodes && (bodies[nodeID].m_invMass != 0.f))\n"
|
||||
" {\n"
|
||||
" //angular velocity\n"
|
||||
@ -61,9 +61,9 @@ static const char* integrateKernelCL= \
|
||||
" float4 angvel = bodies[nodeID].m_angVel;\n"
|
||||
" float fAngle = native_sqrt(dot(angvel, angvel));\n"
|
||||
" //limit the angular motion\n"
|
||||
" if(fAngle*timeStep > B3_GPU_ANGULAR_MOTION_THRESHOLD)\n"
|
||||
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
|
||||
" {\n"
|
||||
" fAngle = B3_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
|
||||
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
|
||||
" }\n"
|
||||
" if(fAngle < 0.001f)\n"
|
||||
" {\n"
|
||||
|
@ -313,8 +313,8 @@ static const char* solveContactCL= \
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
|
||||
" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
|
||||
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
|
||||
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
|
||||
"{\n"
|
||||
" if (fabs(n[0].z) > 0.70710678f) {\n"
|
||||
" // choose p in y-z plane\n"
|
||||
|
@ -265,8 +265,8 @@ static const char* solveFrictionCL= \
|
||||
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
|
||||
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
|
||||
"}\n"
|
||||
"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
|
||||
" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
|
||||
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
|
||||
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
|
||||
"{\n"
|
||||
" if (fabs(n[0].z) > 0.70710678f) {\n"
|
||||
" // choose p in y-z plane\n"
|
||||
@ -347,7 +347,7 @@ static const char* solveFrictionCL= \
|
||||
" float4 n = -cs->m_linear;\n"
|
||||
" \n"
|
||||
" float4 tangent[2];\n"
|
||||
" b3PlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
|
||||
" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
|
||||
" float4 angular0, angular1, linear;\n"
|
||||
" float4 r0 = center - posA;\n"
|
||||
" float4 r1 = center - posB;\n"
|
||||
|
@ -489,8 +489,8 @@ static const char* solverSetupCL= \
|
||||
"} ConstBufferSSD;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
|
||||
" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
|
||||
"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
|
||||
" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
|
||||
"{\n"
|
||||
" if (fabs(n.z) > 0.70710678f) {\n"
|
||||
" // choose p in y-z plane\n"
|
||||
@ -577,7 +577,7 @@ static const char* solverSetupCL= \
|
||||
" center /= (float)src->m_worldNormal.w;\n"
|
||||
"\n"
|
||||
" float4 tangent[2];\n"
|
||||
" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
|
||||
" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
|
||||
" \n"
|
||||
" float4 r[2];\n"
|
||||
" r[0] = center - posA;\n"
|
||||
|
@ -488,8 +488,8 @@ static const char* solverUtilsCL= \
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
|
||||
" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
|
||||
"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
|
||||
" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
|
||||
"{\n"
|
||||
" if (fabs(n.z) > 0.70710678f) {\n"
|
||||
" // choose p in y-z plane\n"
|
||||
@ -739,7 +739,7 @@ static const char* solverUtilsCL= \
|
||||
" float4 n = -cs->m_linear;\n"
|
||||
" \n"
|
||||
" float4 tangent[2];\n"
|
||||
" b3PlaneSpace1(n,&tangent[0],&tangent[1]);\n"
|
||||
" btPlaneSpace1(n,&tangent[0],&tangent[1]);\n"
|
||||
" float4 angular0, angular1, linear;\n"
|
||||
" float4 r0 = center - posA;\n"
|
||||
" float4 r1 = center - posB;\n"
|
||||
@ -896,7 +896,7 @@ static const char* solverUtilsCL= \
|
||||
" center /= (float)src->m_worldNormal.w;\n"
|
||||
"\n"
|
||||
" float4 tangent[2];\n"
|
||||
" b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
|
||||
" btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
|
||||
" \n"
|
||||
" float4 r[2];\n"
|
||||
" r[0] = center - posA;\n"
|
||||
|
@ -120,7 +120,7 @@ static const char* updateAabbsKernelCL= \
|
||||
" float fy;\n"
|
||||
" float fz;\n"
|
||||
" int uw;\n"
|
||||
"} b3AABBCL;\n"
|
||||
"} btAABBCL;\n"
|
||||
"\n"
|
||||
"__inline\n"
|
||||
"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
|
||||
@ -156,7 +156,7 @@ static const char* updateAabbsKernelCL= \
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)\n"
|
||||
"__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)\n"
|
||||
"{\n"
|
||||
" int nodeID = get_global_id(0);\n"
|
||||
" \n"
|
||||
@ -171,8 +171,8 @@ static const char* updateAabbsKernelCL= \
|
||||
" \n"
|
||||
" if (shapeIndex>=0)\n"
|
||||
" {\n"
|
||||
" b3AABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
|
||||
" b3AABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
|
||||
" btAABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
|
||||
" btAABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
|
||||
" \n"
|
||||
" float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;\n"
|
||||
" float4 localCenter = ((float4)(maxAabb.fx + minAabb.fx,maxAabb.fy + minAabb.fy,maxAabb.fz + minAabb.fz,0.f))*0.5f;\n"
|
||||
|
171
test/OpenCL/BitonicSort/BitonicSort.cl
Normal file
171
test/OpenCL/BitonicSort/BitonicSort.cl
Normal file
@ -0,0 +1,171 @@
|
||||
MSTRINGIFY(
|
||||
/*
|
||||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation.
|
||||
* Any use, reproduction, disclosure, or distribution of this software
|
||||
* and related documentation without an express license agreement from
|
||||
* NVIDIA Corporation is strictly prohibited.
|
||||
*
|
||||
* Please refer to the applicable NVIDIA end user license agreement (EULA)
|
||||
* associated with this source code for terms and conditions that govern
|
||||
* your use of this NVIDIA software.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
|
||||
{
|
||||
if((keyA[0].x > keyB[0].x) == dir)
|
||||
{
|
||||
int2 tmp = *keyA;
|
||||
*keyA = *keyB;
|
||||
*keyB = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
|
||||
{
|
||||
if((keyA[0].x > keyB[0].x) == dir)
|
||||
{
|
||||
int2 tmp = *keyA;
|
||||
*keyA = *keyB;
|
||||
*keyB = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Monolithic bitonic sort kernel for short arrays fitting into local memory
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
|
||||
{
|
||||
__local int2 l_key[1024U];
|
||||
int localSizeLimit = get_local_size(0) * 2;
|
||||
|
||||
//Offset to the beginning of subbatch and load data
|
||||
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
|
||||
l_key[get_local_id(0) + 0] = pKey[ 0];
|
||||
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
|
||||
|
||||
for(uint size = 2; size < arrayLength; size <<= 1)
|
||||
{
|
||||
//Bitonic merge
|
||||
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
|
||||
for(uint stride = size / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
}
|
||||
|
||||
//ddd == dir for the last bitonic merge step
|
||||
{
|
||||
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
pKey[ 0] = l_key[get_local_id(0) + 0];
|
||||
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bitonic sort kernel for large arrays (not fitting into local memory)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//Bottom-level bitonic sort
|
||||
//Almost the same as bitonicSortLocal with the only exception
|
||||
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
|
||||
//sorted in opposite directions
|
||||
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
|
||||
{
|
||||
__local int2 l_key[1024U];
|
||||
uint localSizeLimit = get_local_size(0) * 2;
|
||||
|
||||
//Offset to the beginning of subarray and load data
|
||||
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
|
||||
l_key[get_local_id(0) + 0] = pKey[ 0];
|
||||
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
|
||||
|
||||
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
|
||||
|
||||
for(uint size = 2; size < localSizeLimit; size <<= 1)
|
||||
{
|
||||
//Bitonic merge
|
||||
uint ddd = (comparatorI & (size / 2)) != 0;
|
||||
for(uint stride = size / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
}
|
||||
|
||||
//Odd / even arrays of localSizeLimit elements
|
||||
//sorted in opposite directions
|
||||
{
|
||||
uint ddd = (get_group_id(0) & 1);
|
||||
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
pKey[ 0] = l_key[get_local_id(0) + 0];
|
||||
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
|
||||
}
|
||||
|
||||
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
|
||||
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
|
||||
{
|
||||
uint global_comparatorI = get_global_id(0);
|
||||
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
|
||||
|
||||
//Bitonic merge
|
||||
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
|
||||
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
|
||||
|
||||
int2 keyA = pKey[pos + 0];
|
||||
int2 keyB = pKey[pos + stride];
|
||||
|
||||
ComparatorPrivate(&keyA, &keyB, ddd);
|
||||
|
||||
pKey[pos + 0] = keyA;
|
||||
pKey[pos + stride] = keyB;
|
||||
}
|
||||
|
||||
//Combined bitonic merge steps for
|
||||
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
|
||||
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
|
||||
{
|
||||
__local int2 l_key[1024U];
|
||||
int localSizeLimit = get_local_size(0) * 2;
|
||||
|
||||
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
|
||||
l_key[get_local_id(0) + 0] = pKey[ 0];
|
||||
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
|
||||
|
||||
//Bitonic merge
|
||||
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
|
||||
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
|
||||
for(; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
|
||||
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
pKey[ 0] = l_key[get_local_id(0) + 0];
|
||||
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
|
||||
}
|
||||
);
|
83
test/OpenCL/BitonicSort/b3BitonicSort.cpp
Normal file
83
test/OpenCL/BitonicSort/b3BitonicSort.cpp
Normal file
@ -0,0 +1,83 @@
|
||||
|
||||
#include "b3BitonicSort.h"
|
||||
#include "Bullet3Common/b3Scalar.h"
|
||||
|
||||
|
||||
//Note: logically shared with BitonicSort OpenCL code!
|
||||
// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
|
||||
|
||||
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
|
||||
{
|
||||
|
||||
if(arrayLength < 2)
|
||||
return;
|
||||
//Only power-of-two array lengths are supported so far
|
||||
info.dir = (info.dir != 0);
|
||||
cl_int ciErrNum;
|
||||
size_t localWorkSize, globalWorkSize;
|
||||
if(arrayLength <= info.localSizeLimit)
|
||||
{
|
||||
b3Assert( ( arrayLength) % info.localSizeLimit == 0);
|
||||
//Launch bitonicSortLocal
|
||||
ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
localWorkSize = info.localSizeLimit / 2;
|
||||
globalWorkSize = arrayLength / 2;
|
||||
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Launch bitonicSortLocal1
|
||||
ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
localWorkSize = info.localSizeLimit / 2;
|
||||
globalWorkSize = arrayLength / 2;
|
||||
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
|
||||
{
|
||||
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
if(stride >= info.localSizeLimit)
|
||||
{
|
||||
//Launch bitonicMergeGlobal
|
||||
ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
localWorkSize = info.localSizeLimit / 4;
|
||||
globalWorkSize = arrayLength / 2;
|
||||
|
||||
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Launch bitonicMergeLocal
|
||||
ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
|
||||
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
localWorkSize = info.localSizeLimit / 2;
|
||||
globalWorkSize = arrayLength / 2;
|
||||
|
||||
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
30
test/OpenCL/BitonicSort/b3BitonicSort.h
Normal file
30
test/OpenCL/BitonicSort/b3BitonicSort.h
Normal file
@ -0,0 +1,30 @@
|
||||
#ifndef B3_BITONIC_SORT_H
|
||||
#define B3_BITONIC_SORT_H
|
||||
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
|
||||
|
||||
struct b3BitonicSortInfo
|
||||
{
|
||||
cl_command_queue m_cqCommandQue;
|
||||
cl_kernel bitonicSortLocal;
|
||||
cl_kernel bitonicSortLocal1;
|
||||
cl_kernel bitonicSortMergeGlobal;
|
||||
cl_kernel bitonicSortMergeLocal;
|
||||
unsigned int dir;
|
||||
unsigned int localSizeLimit;
|
||||
|
||||
b3BitonicSortInfo()
|
||||
{
|
||||
bitonicSortLocal=0;
|
||||
bitonicSortLocal1=0;
|
||||
bitonicSortMergeGlobal=0;
|
||||
bitonicSortMergeLocal=0;
|
||||
dir = 1;
|
||||
localSizeLimit = 1024U;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info);
|
||||
|
||||
#endif //B3_BITONIC_SORT_H
|
192
test/OpenCL/BitonicSort/main.cpp
Normal file
192
test/OpenCL/BitonicSort/main.cpp
Normal file
@ -0,0 +1,192 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
///original author: Erwin Coumans
|
||||
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3Common/b3Int2.h"
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
|
||||
#include "b3BitonicSort.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
int numSuccess=0;
|
||||
int numFailed=0;
|
||||
|
||||
cl_context g_cxMainContext;
|
||||
cl_command_queue g_cqCommandQue;
|
||||
|
||||
#define MSTRINGIFY(A) #A
|
||||
static const char* kernelSource=
|
||||
#include "BitonicSort.cl"
|
||||
|
||||
|
||||
|
||||
|
||||
static bool compareFunc(const b3Int2& p, const b3Int2& q)
|
||||
{
|
||||
return (p.x < q.x) || ((p.x == q.x) && ((p.y < q.y)));
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int ciErrNum = 0;
|
||||
|
||||
b3Clock clock;
|
||||
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
|
||||
|
||||
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
|
||||
int numPlatforms = b3OpenCLUtils::getNumPlatforms();
|
||||
printf("Num Platforms = %d\n", numPlatforms);
|
||||
|
||||
for (int i=0;i<numPlatforms;i++)
|
||||
{
|
||||
cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
|
||||
b3OpenCLPlatformInfo platformInfo;
|
||||
b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
|
||||
printf("--------------------------------\n");
|
||||
printf("Platform info for platform nr %d:\n",i);
|
||||
printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
|
||||
printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
|
||||
printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
|
||||
|
||||
cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
|
||||
|
||||
int numDevices = b3OpenCLUtils::getNumDevices(context);
|
||||
printf("Num Devices = %d\n", numDevices);
|
||||
for (int j=0;j<numDevices;j++)
|
||||
{
|
||||
cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
|
||||
b3OpenCLDeviceInfo devInfo;
|
||||
b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
|
||||
printf("m_deviceName = %s\n",devInfo.m_deviceName);
|
||||
//b3OpenCLUtils::printDeviceInfo(dev);
|
||||
|
||||
g_cqCommandQue = clCreateCommandQueue(context, dev, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
b3BitonicSortInfo info;
|
||||
|
||||
info.bitonicSortLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal",&ciErrNum,0,"");
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
info.bitonicSortLocal1 = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal1",&ciErrNum,0,"");
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
info.bitonicSortMergeGlobal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeGlobal",&ciErrNum,0,"");
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
info.bitonicSortMergeLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeLocal",&ciErrNum,0,"");
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
info.m_cqCommandQue = g_cqCommandQue;
|
||||
|
||||
b3OpenCLArray<b3Int2> keyValuesGPU(context,g_cqCommandQue);
|
||||
b3AlignedObjectArray<b3Int2> keyValuesCPU;
|
||||
b3AlignedObjectArray<b3Int2> keyValuesGold;
|
||||
int numValues = 8*1024*1024;//2048;//1024;
|
||||
keyValuesCPU.resize(numValues);
|
||||
for (int i=0;i<numValues;i++)
|
||||
{
|
||||
b3Int2 v;
|
||||
v.x = numValues+1-i;
|
||||
v.y = i*i;
|
||||
keyValuesCPU[i] = v;
|
||||
}
|
||||
keyValuesGPU.copyFromHost(keyValuesCPU);
|
||||
keyValuesGPU.copyToHost(keyValuesGold);
|
||||
keyValuesGold.quickSort(compareFunc);
|
||||
|
||||
unsigned int batch = 1;
|
||||
unsigned int arrayLength = keyValuesGPU.size();
|
||||
|
||||
for (int i=0;i<10;i++)
|
||||
{
|
||||
keyValuesGPU.copyFromHost(keyValuesCPU);
|
||||
clFinish(info.m_cqCommandQue);
|
||||
unsigned long pre=clock.getTimeMilliseconds();
|
||||
bitonicSortNv(keyValuesGPU.getBufferCL(), arrayLength, info);
|
||||
clFinish(info.m_cqCommandQue);
|
||||
unsigned long post=clock.getTimeMilliseconds();
|
||||
printf("GPU sort took %d ms\n",post-pre);
|
||||
}
|
||||
keyValuesGPU.copyToHost(keyValuesCPU);
|
||||
int success=1;
|
||||
for (int i=0;i<numValues;i++)
|
||||
{
|
||||
if (keyValuesCPU[i].x != keyValuesGold[i].x)
|
||||
success = 0;
|
||||
if (keyValuesCPU[i].y != keyValuesGold[i].y)
|
||||
success = 0;
|
||||
}
|
||||
if (success)
|
||||
{
|
||||
printf("Correct\n");
|
||||
numSuccess++;
|
||||
} else
|
||||
{
|
||||
printf("Sort Failed\n");
|
||||
numFailed++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
clReleaseContext(context);
|
||||
}
|
||||
|
||||
///Easier method to initialize OpenCL using createContextFromType for a GPU
|
||||
deviceType = CL_DEVICE_TYPE_GPU;
|
||||
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
printf("Initialize OpenCL using b3OpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
|
||||
g_cxMainContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
if (g_cxMainContext)
|
||||
{
|
||||
int numDev = b3OpenCLUtils::getNumDevices(g_cxMainContext);
|
||||
|
||||
for (int i=0;i<numDev;i++)
|
||||
{
|
||||
cl_device_id device;
|
||||
device = b3OpenCLUtils::getDevice(g_cxMainContext,i);
|
||||
b3OpenCLDeviceInfo clInfo;
|
||||
b3OpenCLUtils::getDeviceInfo(device,&clInfo);
|
||||
b3OpenCLUtils::printDeviceInfo(device);
|
||||
// create a command-queue
|
||||
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
//normally you would create and execute kernels using this command queue
|
||||
|
||||
|
||||
clReleaseCommandQueue(g_cqCommandQue);
|
||||
}
|
||||
|
||||
clReleaseContext(g_cxMainContext);
|
||||
|
||||
}
|
||||
else {
|
||||
printf("No OpenCL capable GPU found!");
|
||||
}
|
||||
|
||||
printf("numSuccess=%d\n",numSuccess);
|
||||
printf("numFailed=%d\n",numFailed);
|
||||
|
||||
printf("press <Enter>\n");
|
||||
getchar();
|
||||
return 0;
|
||||
}
|
36
test/OpenCL/BitonicSort/premake4.lua
Normal file
36
test/OpenCL/BitonicSort/premake4.lua
Normal file
@ -0,0 +1,36 @@
|
||||
function createProject(vendor)
|
||||
|
||||
hasCL = findOpenCL(vendor)
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project ("Test_BitonicSort_" .. vendor)
|
||||
|
||||
initOpenCL(vendor)
|
||||
|
||||
language "C++"
|
||||
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../bin"
|
||||
|
||||
includedirs {"../../../src"}
|
||||
|
||||
files {
|
||||
"main.cpp",
|
||||
"b3BitonicSort.cpp",
|
||||
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
|
||||
"../../../src/Bullet3Common/b3AlignedAllocator.h",
|
||||
"../../../src/Bullet3Common/b3Quickprof.cpp",
|
||||
"../../../src/Bullet3Common/b3Quickprof.h",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
}
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
createProject("Apple")
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
378
test/OpenCL/ParallelPrimitives/main.cpp
Normal file
378
test/OpenCL/ParallelPrimitives/main.cpp
Normal file
@ -0,0 +1,378 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
|
||||
#include "Bullet3Common/b3CommandLineArgs.h"
|
||||
#include "Bullet3Common/b3MinMax.h"
|
||||
|
||||
int g_nPassed = 0;
|
||||
int g_nFailed = 0;
|
||||
bool g_testFailed = 0;
|
||||
|
||||
#define TEST_INIT g_testFailed = 0;
|
||||
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
|
||||
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
|
||||
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
cl_context g_context=0;
|
||||
cl_device_id g_device=0;
|
||||
cl_command_queue g_queue =0;
|
||||
const char* g_deviceName = 0;
|
||||
|
||||
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
|
||||
{
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
int ciErrNum = 0;
|
||||
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
|
||||
g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numDev = b3OpenCLUtils::getNumDevices(g_context);
|
||||
if (numDev>0)
|
||||
{
|
||||
b3OpenCLDeviceInfo info;
|
||||
g_device= b3OpenCLUtils::getDevice(g_context,0);
|
||||
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
b3OpenCLUtils::printDeviceInfo(g_device);
|
||||
b3OpenCLUtils::getDeviceInfo(g_device,&info);
|
||||
g_deviceName = info.m_deviceName;
|
||||
}
|
||||
}
|
||||
|
||||
void exitCL()
|
||||
{
|
||||
clReleaseCommandQueue(g_queue);
|
||||
clReleaseContext(g_context);
|
||||
}
|
||||
|
||||
|
||||
inline void fillIntTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
|
||||
int maxSize=1024*256;
|
||||
b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
|
||||
intBuffer.resize(maxSize);
|
||||
|
||||
#define NUM_TESTS 7
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for (int iter=0;iter<NUM_TESTS;iter++)
|
||||
{
|
||||
int size = b3Min( 11+dx*iter, maxSize );
|
||||
|
||||
int value = 2;
|
||||
|
||||
|
||||
int offset=0;
|
||||
fillCL->execute(intBuffer,value,size,offset);
|
||||
|
||||
b3AlignedObjectArray<int> hostBuf2;
|
||||
hostBuf2.resize(size);
|
||||
fillCL->executeHost(hostBuf2,value,size,offset);
|
||||
|
||||
b3AlignedObjectArray<int> hostBuf;
|
||||
intBuffer.copyToHost(hostBuf);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
|
||||
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
delete fillCL;
|
||||
|
||||
TEST_REPORT( "fillIntTest" );
|
||||
}
|
||||
|
||||
|
||||
__inline
|
||||
void seedRandom(int seed)
|
||||
{
|
||||
srand( seed );
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__inline
|
||||
T getRandom(const T& minV, const T& maxV)
|
||||
{
|
||||
float r = (rand()%10000)/10000.f;
|
||||
T range = maxV - minV;
|
||||
return (T)(minV + r*range);
|
||||
}
|
||||
|
||||
struct b3SortDataCompare
|
||||
{
|
||||
inline bool operator()(const b3SortData& first, const b3SortData& second) const
|
||||
{
|
||||
return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void boundSearchTest( )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
int bucketSize = 256;
|
||||
|
||||
b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
|
||||
b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
|
||||
b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
|
||||
|
||||
b3AlignedObjectArray<b3SortData> srcHost;
|
||||
b3AlignedObjectArray<unsigned int> upperHost;
|
||||
b3AlignedObjectArray<unsigned int> lowerHost;
|
||||
b3AlignedObjectArray<unsigned int> upperHostCompare;
|
||||
b3AlignedObjectArray<unsigned int> lowerHostCompare;
|
||||
|
||||
b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
|
||||
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
|
||||
int size = b3Min( 128+dx*iter, maxSize );
|
||||
|
||||
upperHost.resize(bucketSize);
|
||||
lowerHost.resize(bucketSize);
|
||||
upperHostCompare.resize(bucketSize);
|
||||
lowerHostCompare.resize(bucketSize);
|
||||
|
||||
srcHost.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
b3SortData v;
|
||||
// v.m_key = i<2? 0 : 5;
|
||||
v.m_key = getRandom(0,bucketSize);
|
||||
|
||||
v.m_value = i;
|
||||
srcHost.at(i) = v;
|
||||
}
|
||||
|
||||
srcHost.quickSort(b3SortDataCompare());
|
||||
srcCL.copyFromHost(srcHost);
|
||||
|
||||
{
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
lowerHost[i] = -1;
|
||||
lowerHostCompare[i] = -1;
|
||||
upperHost[i] = -1;
|
||||
upperHostCompare[i] = -1;
|
||||
}
|
||||
upperCL.copyFromHost(upperHost);
|
||||
lowerCL.copyFromHost(lowerHost);
|
||||
}
|
||||
|
||||
search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
|
||||
search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
|
||||
|
||||
search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
|
||||
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
|
||||
|
||||
lowerCL.copyToHost(lowerHost);
|
||||
upperCL.copyToHost(upperHost);
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
|
||||
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
|
||||
}
|
||||
/*
|
||||
for(int i=1; i<bucketSize; i++)
|
||||
{
|
||||
int lhi_1 = lowerHost[i-1];
|
||||
int lhi = lowerHost[i];
|
||||
|
||||
for(int j=lhi_1; j<lhi; j++)
|
||||
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( srcHost[j].m_key < i );
|
||||
}
|
||||
}
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
int jMin = (i==0)?0:upperHost[i-1];
|
||||
for(int j=jMin; j<upperHost[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( srcHost[j].m_key <= i );
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
int lhi = lowerHost[i];
|
||||
int uhi = upperHost[i];
|
||||
|
||||
for(int j=lhi; j<uhi; j++)
|
||||
{
|
||||
if ( srcHost[j].m_key != i )
|
||||
{
|
||||
printf("error %d != %d\n",srcHost[j].m_key,i);
|
||||
}
|
||||
TEST_ASSERT( srcHost[j].m_key == i );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
delete search;
|
||||
|
||||
TEST_REPORT( "boundSearchTest" );
|
||||
}
|
||||
|
||||
|
||||
void prefixScanTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
b3AlignedObjectArray<unsigned int> buf0Host;
|
||||
b3AlignedObjectArray<unsigned int> buf1Host;
|
||||
|
||||
b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
|
||||
b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
|
||||
|
||||
|
||||
b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = b3Min( 128+dx*iter, maxSize );
|
||||
buf0Host.resize(size);
|
||||
buf1Host.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
buf0Host[i] = 1;
|
||||
|
||||
buf2CL.copyFromHost( buf0Host);
|
||||
|
||||
unsigned int sumHost, sumGPU;
|
||||
|
||||
scan->executeHost(buf0Host, buf1Host, size, &sumHost );
|
||||
scan->execute( buf2CL, buf3CL, size, &sumGPU );
|
||||
|
||||
buf3CL.copyToHost(buf0Host);
|
||||
|
||||
TEST_ASSERT( sumHost == sumGPU );
|
||||
for(int i=0; i<size; i++)
|
||||
TEST_ASSERT( buf1Host[i] == buf0Host[i] );
|
||||
}
|
||||
|
||||
delete scan;
|
||||
|
||||
TEST_REPORT( "scanTest" );
|
||||
}
|
||||
|
||||
|
||||
bool radixSortTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
b3AlignedObjectArray<b3SortData> buf0Host;
|
||||
buf0Host.resize(maxSize);
|
||||
b3AlignedObjectArray<b3SortData> buf1Host;
|
||||
buf1Host.resize(maxSize );
|
||||
b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
|
||||
|
||||
b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = b3Min( 128+dx*iter, maxSize-512 );
|
||||
size = NEXTMULTIPLEOF( size, 512 );//not necessary
|
||||
|
||||
buf0Host.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
b3SortData v;
|
||||
v.m_key = getRandom(0,0xff);
|
||||
v.m_value = i;
|
||||
buf0Host[i] = v;
|
||||
}
|
||||
|
||||
buf2CL.copyFromHost( buf0Host);
|
||||
|
||||
|
||||
sort->executeHost( buf0Host);
|
||||
sort->execute(buf2CL);
|
||||
|
||||
buf2CL.copyToHost(buf1Host);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
|
||||
}
|
||||
}
|
||||
|
||||
delete sort;
|
||||
|
||||
TEST_REPORT( "radixSort" );
|
||||
|
||||
return g_testFailed;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int preferredDeviceIndex = -1;
|
||||
int preferredPlatformIndex = -1;
|
||||
|
||||
b3CommandLineArgs args(argc, argv);
|
||||
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
|
||||
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
|
||||
|
||||
initCL(preferredDeviceIndex,preferredPlatformIndex);
|
||||
|
||||
fillIntTest();
|
||||
|
||||
boundSearchTest();
|
||||
|
||||
prefixScanTest();
|
||||
|
||||
radixSortTest();
|
||||
|
||||
exitCL();
|
||||
|
||||
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
|
||||
printf("End, press <enter>\n");
|
||||
getchar();
|
||||
}
|
41
test/OpenCL/ParallelPrimitives/premake4.lua
Normal file
41
test/OpenCL/ParallelPrimitives/premake4.lua
Normal file
@ -0,0 +1,41 @@
|
||||
function createProject(vendor)
|
||||
hasCL = findOpenCL(vendor)
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project ("Test_OpenCL_Primitives_" .. vendor)
|
||||
|
||||
initOpenCL(vendor)
|
||||
|
||||
language "C++"
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../bin"
|
||||
includedirs {".","../../../src"}
|
||||
|
||||
|
||||
files {
|
||||
"main.cpp",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h",
|
||||
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
|
||||
"../../../src/Bullet3Common/b3AlignedAllocator.h",
|
||||
"../../../src/Bullet3Common/b3AlignedObjectArray.h",
|
||||
}
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
712
test/OpenCL/RadixSortBenchmark/main.cpp
Normal file
712
test/OpenCL/RadixSortBenchmark/main.cpp
Normal file
@ -0,0 +1,712 @@
|
||||
/******************************************************************************
|
||||
* Copyright 2010 Duane Merrill
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may ob3ain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* AUTHORS' REQUEST:
|
||||
*
|
||||
* If you use|reference|benchmark this code, please cite our Technical
|
||||
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
|
||||
*
|
||||
* @TechReport{ Merrill:Sorting:2010,
|
||||
* author = "Duane Merrill and Andrew Grimshaw",
|
||||
* title = "Revisiting Sorting for GPGPU Stream Architectures",
|
||||
* year = "2010",
|
||||
* institution = "University of Virginia, Department of Computer Science",
|
||||
* address = "Charlottesville, VA, USA",
|
||||
* number = "CS2010-03"
|
||||
* }
|
||||
*
|
||||
* For more information, see our Google Code project site:
|
||||
* http://code.google.com/p/back40computing/
|
||||
*
|
||||
* Thanks!
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Simple test driver program for *large-problem* radix sorting.
|
||||
*
|
||||
* Useful for demonstrating how to integrate radix sorting into
|
||||
* your application
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
|
||||
******************************************************************************/
|
||||
#ifdef _WIN32
|
||||
#pragma warning (disable:4996)
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
|
||||
//#include <iostream>
|
||||
#include <sstream>
|
||||
/**********************
|
||||
*
|
||||
*/
|
||||
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3Common/b3Quickprof.h"
|
||||
|
||||
cl_context g_cxMainContext;
|
||||
cl_device_id g_device;
|
||||
cl_command_queue g_cqCommandQueue;
|
||||
|
||||
/***********************
|
||||
*
|
||||
*/
|
||||
|
||||
bool g_verbose;
|
||||
///Preferred OpenCL device/platform. When < 0 then no preference is used.
|
||||
///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
|
||||
///Preferred device/platform take priority over this platform-vendor match
|
||||
int gPreferredDeviceId = -1;
|
||||
int gPreferredPlatformId = -1;
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Routines
|
||||
******************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
unsigned int iterations)
|
||||
{
|
||||
printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
b3AlignedObjectArray<unsigned int> hostData;
|
||||
hostData.resize(num_elements);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
hostData[i] = h_keys[i];
|
||||
}
|
||||
|
||||
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
|
||||
|
||||
b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
|
||||
gpuData.copyFromHost(hostData);
|
||||
//sorter.executeHost(gpuData);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
b3AlignedObjectArray<unsigned int> hostDataSorted;
|
||||
gpuData.copyToHost(hostDataSorted);
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
{
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
b3Clock watch;
|
||||
|
||||
//warm-start
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
watch.reset();
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
// Start GPU timing record
|
||||
double startMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
// Call the sorting API routine
|
||||
sorter.execute(gpuData);
|
||||
|
||||
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
double stopMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
duration = stopMs - startMs;
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
printf("duration = %f\n", duration);
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
gpuData.copyToHost(hostData);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
h_keys[i] = hostData[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in,out] h_values
|
||||
* Vector of values to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K, typename V>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
V *h_values,
|
||||
unsigned int iterations)
|
||||
{
|
||||
|
||||
printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
b3AlignedObjectArray<b3SortData> hostData;
|
||||
hostData.resize(num_elements);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
hostData[i].m_key = h_keys[i];
|
||||
hostData[i].m_value = h_values[i];
|
||||
}
|
||||
|
||||
b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
|
||||
|
||||
b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
|
||||
gpuData.copyFromHost(hostData);
|
||||
//sorter.executeHost(gpuData);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
b3AlignedObjectArray<b3SortData> hostDataSorted;
|
||||
gpuData.copyToHost(hostDataSorted);
|
||||
#if 0
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
|
||||
printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
|
||||
}
|
||||
#endif
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
{
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
b3Clock watch;
|
||||
|
||||
//warm-start
|
||||
gpuData.copyFromHost(hostData);
|
||||
sorter.execute(gpuData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
watch.reset();
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
// Start GPU timing record
|
||||
double startMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
// Call the sorting API routine
|
||||
sorter.execute(gpuData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
double stopMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
duration = stopMs - startMs;
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
printf("duration = %f\n", duration);
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
gpuData.copyToHost(hostData);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
h_keys[i] = hostData[i].m_key;
|
||||
h_values[i] = hostData[i].m_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Generates random 32-bit keys.
|
||||
*
|
||||
* We always take the second-order byte from rand() because the higher-order
|
||||
* bits returned by rand() are commonly considered more uniformly distributed
|
||||
* than the lower-order bits.
|
||||
*
|
||||
* We can decrease the entropy level of keys by adopting the technique
|
||||
* of Thearling and Smith in which keys are computed from the bitwise AND of
|
||||
* multiple random samples:
|
||||
*
|
||||
* entropy_reduction | Effectively-unique bits per key
|
||||
* -----------------------------------------------------
|
||||
* -1 | 0
|
||||
* 0 | 32
|
||||
* 1 | 25.95
|
||||
* 2 | 17.41
|
||||
* 3 | 10.78
|
||||
* 4 | 6.42
|
||||
* ... | ...
|
||||
*
|
||||
*/
|
||||
template <typename K>
|
||||
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
|
||||
{
|
||||
const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
|
||||
unsigned char key_bits[NUM_UCHARS];
|
||||
|
||||
do {
|
||||
|
||||
for (int j = 0; j < NUM_UCHARS; j++) {
|
||||
unsigned char quarterword = 0xff;
|
||||
for (int i = 0; i <= entropy_reduction; i++) {
|
||||
quarterword &= (rand() >> 7);
|
||||
}
|
||||
key_bits[j] = quarterword;
|
||||
}
|
||||
|
||||
if (lower_key_bits < sizeof(K) * 8) {
|
||||
unsigned long long base = 0;
|
||||
memcpy(&base, key_bits, sizeof(K));
|
||||
base &= (1 << lower_key_bits) - 1;
|
||||
memcpy(key_bits, &base, sizeof(K));
|
||||
}
|
||||
|
||||
memcpy(&key, key_bits, sizeof(K));
|
||||
|
||||
} while (key != key); // avoids NaNs when generating random floating point numbers
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Templated routines for printing keys/values to the console
|
||||
******************************************************************************/
|
||||
|
||||
template<typename T>
|
||||
void PrintValue(T val) {
|
||||
printf("%d", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<float>(float val) {
|
||||
printf("%f", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<double>(double val) {
|
||||
printf("%f", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned char>(unsigned char val) {
|
||||
printf("%u", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned short>(unsigned short val) {
|
||||
printf("%u", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned int>(unsigned int val) {
|
||||
printf("%u", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<long>(long val) {
|
||||
printf("%ld", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned long>(unsigned long val) {
|
||||
printf("%lu", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<long long>(long long val) {
|
||||
printf("%lld", val);
|
||||
}
|
||||
|
||||
template<>
|
||||
void PrintValue<unsigned long long>(unsigned long long val) {
|
||||
printf("%llu", val);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Compares the equivalence of two arrays
|
||||
*/
|
||||
template <typename T, typename SizeT>
|
||||
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
|
||||
{
|
||||
printf("\n");
|
||||
for (SizeT i = 0; i < len; i++) {
|
||||
|
||||
if (computed[i] != reference[i]) {
|
||||
printf("INCORRECT: [%lu]: ", (unsigned long) i);
|
||||
PrintValue<T>(computed[i]);
|
||||
printf(" != ");
|
||||
PrintValue<T>(reference[i]);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nresult[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(computed[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
printf("\nreference[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(reference[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("CORRECT\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an example sorting problem whose keys is a vector of the specified
|
||||
* number of K elements, values of V elements, and then dispatches the problem
|
||||
* to the GPU for the given number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template<typename K, typename V>
|
||||
void TestSort(
|
||||
unsigned int iterations,
|
||||
int num_elements,
|
||||
bool keys_only)
|
||||
{
|
||||
// Allocate the sorting problem on the host and fill the keys with random bytes
|
||||
|
||||
K *h_keys = NULL;
|
||||
K *h_reference_keys = NULL;
|
||||
V *h_values = NULL;
|
||||
h_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
h_reference_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
|
||||
|
||||
|
||||
// Use random bits
|
||||
for (unsigned int i = 0; i < num_elements; ++i) {
|
||||
RandomBits<K>(h_keys[i], 0);
|
||||
//h_keys[i] = num_elements-i;
|
||||
//h_keys[i] = 0xffffffffu-i;
|
||||
if (!keys_only)
|
||||
h_values[i] = h_keys[i];//0xffffffffu-i;
|
||||
|
||||
h_reference_keys[i] = h_keys[i];
|
||||
}
|
||||
|
||||
// Run the timing test
|
||||
if (keys_only) {
|
||||
TimedSort<K>(num_elements, h_keys, iterations);
|
||||
} else {
|
||||
TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
|
||||
}
|
||||
|
||||
// cudaThreadSynchronize();
|
||||
|
||||
// Display sorted key data
|
||||
if (g_verbose) {
|
||||
printf("\n\nKeys:\n");
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
PrintValue<K>(h_keys[i]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
// Verify solution
|
||||
std::sort(h_reference_keys, h_reference_keys + num_elements);
|
||||
CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
|
||||
// Free our allocated host memory
|
||||
if (h_keys != NULL) free(h_keys);
|
||||
if (h_values != NULL) free(h_values);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Displays the commandline usage for this tool
|
||||
*/
|
||||
void Usage()
|
||||
{
|
||||
printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
|
||||
printf("\n");
|
||||
printf("\t--v\tDisplays sorted results to the console.\n");
|
||||
printf("\n");
|
||||
printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
|
||||
printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
|
||||
printf("\n");
|
||||
printf("\t--n\tThe number of elements to comprise the sample problem\n");
|
||||
printf("\t\t\tDefault = 512\n");
|
||||
printf("\n");
|
||||
printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Command-line parsing
|
||||
******************************************************************************/
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
class b3CommandLineArgs
|
||||
{
|
||||
protected:
|
||||
|
||||
std::map<std::string, std::string> pairs;
|
||||
|
||||
public:
|
||||
|
||||
// Constructor
|
||||
b3CommandLineArgs(int argc, char **argv)
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
string arg = argv[i];
|
||||
|
||||
if ((arg[0] != '-') || (arg[1] != '-')) {
|
||||
continue;
|
||||
}
|
||||
|
||||
string::size_type pos;
|
||||
string key, val;
|
||||
if ((pos = arg.find( '=')) == string::npos) {
|
||||
key = string(arg, 2, arg.length() - 2);
|
||||
val = "";
|
||||
} else {
|
||||
key = string(arg, 2, pos - 2);
|
||||
val = string(arg, pos + 1, arg.length() - 1);
|
||||
}
|
||||
pairs[key] = val;
|
||||
}
|
||||
}
|
||||
|
||||
bool CheckCmdLineFlag(const char* arg_name)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GetCmdLineArgument(const char *arg_name, T &val);
|
||||
|
||||
int ParsedArgc()
|
||||
{
|
||||
return pairs.size();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
istringstream strstream(itr->second);
|
||||
strstream >> val;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
|
||||
string s = itr->second;
|
||||
val = (char*) malloc(sizeof(char) * (s.length() + 1));
|
||||
strcpy(val, s.c_str());
|
||||
|
||||
} else {
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Main
|
||||
******************************************************************************/
|
||||
|
||||
extern bool gDebugSkipLoadingBinary;
|
||||
|
||||
int main( int argc, char** argv)
|
||||
{
|
||||
//gDebugSkipLoadingBinary = true;
|
||||
|
||||
cl_int ciErrNum;
|
||||
b3CommandLineArgs args(argc,argv);
|
||||
|
||||
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
|
||||
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
|
||||
|
||||
printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
|
||||
cl_platform_id platformId;
|
||||
// g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
//g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
|
||||
|
||||
if (!numDev)
|
||||
{
|
||||
printf("error: no OpenCL devices\n");
|
||||
exit(0);
|
||||
}
|
||||
int result;
|
||||
int devId = 0;
|
||||
g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
|
||||
b3OpenCLUtils_printDeviceInfo(g_device);
|
||||
// create a command-queue
|
||||
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
|
||||
|
||||
//srand(time(NULL));
|
||||
srand(0); // presently deterministic
|
||||
|
||||
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
|
||||
unsigned int iterations = 10;
|
||||
bool keys_only = true;
|
||||
|
||||
//
|
||||
// Check command line arguments
|
||||
//
|
||||
|
||||
|
||||
|
||||
if (args.CheckCmdLineFlag("help"))
|
||||
{
|
||||
Usage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
args.GetCmdLineArgument("i", iterations);
|
||||
args.GetCmdLineArgument("n", num_elements);
|
||||
|
||||
|
||||
|
||||
keys_only = !args.CheckCmdLineFlag("key-values");
|
||||
g_verbose = args.CheckCmdLineFlag("v");
|
||||
|
||||
|
||||
|
||||
TestSort<unsigned int, unsigned int>(
|
||||
iterations,
|
||||
num_elements,
|
||||
keys_only);
|
||||
|
||||
|
||||
}
|
40
test/OpenCL/RadixSortBenchmark/premake4.lua
Normal file
40
test/OpenCL/RadixSortBenchmark/premake4.lua
Normal file
@ -0,0 +1,40 @@
|
||||
function createProject(vendor)
|
||||
hasCL = findOpenCL(vendor)
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project ("Test_OpenCL_RadixSortBenchmark_" .. vendor)
|
||||
|
||||
initOpenCL(vendor)
|
||||
|
||||
language "C++"
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../bin"
|
||||
includedirs {"..","../../../src"}
|
||||
|
||||
-- links {
|
||||
-- ("OpenCL_lib_parallel_primitives_host_" .. vendor)
|
||||
-- }
|
||||
|
||||
files {
|
||||
"main.cpp",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
|
||||
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
|
||||
"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
|
||||
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
|
||||
"../../../src/Bullet3Common/b3AlignedAllocator.h",
|
||||
"../../../src/Bullet3Common/b3AlignedObjectArray.h",
|
||||
"../../../src/Bullet3Common/b3Quickprof.cpp",
|
||||
"../../../src/Bullet3Common/b3Quickprof.h",
|
||||
}
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
Loading…
Reference in New Issue
Block a user