add bitonic sort, as comparison.

fix stringify.bat for Windows (need to fix Mac/Linux version too)
2024-12-13 13:20:07 +00:00 · 2013-04-30 11:40:09 -07:00 · 2013-04-30 11:40:09 -07:00 · 92f0938af3
commit 92f0938af3
parent c5f488fe6d
24 changed files with 1857 additions and 177 deletions
--- a/build/premake4.lua
+++ b/build/premake4.lua
@ -101,7 +101,9 @@
 		include "../test/OpenCL/BasicInitialize"
 --		include "../test/OpenCL/BroadphaseCollision"
 --		include "../test/OpenCL/NarrowphaseCollision"
--		include "../test/OpenCL/ParallelPrimitives"
+		include "../test/OpenCL/ParallelPrimitives"
+		include "../test/OpenCL/RadixSortBenchmark"
+		include "../test/OpenCL/BitonicSort"

 		include "../src/Bullet3Dynamics"
 		include "../src/Bullet3Common"
--- a/build/stringify.bat
+++ b/build/stringify.bat
@ -1,35 +1,30 @@

-@echo off
+rem @echo off


-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" 	--headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" 	--headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" 	--headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" 				--headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" 	--headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" 	--headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" 	--headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" 				--headerfile="../src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify

-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sap.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapKernels.h" --stringname="sapCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sapFast.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify

-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/sat.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satKernels.h" --stringname="satKernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.cl" --headerfile="../opencl/gpu_narrowphase/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.cl" --headerfile="../opencl/gpu_narrowphase/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify
-
-
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup.h" --stringname="solverSetupCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverSetup2.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernels.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.cl" --headerfile="../opencl/gpu_rigidbody/kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solverUtils.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solverUtils.h" --stringname="solverUtilsCL" stringify
-
-
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveContact.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveContact.h" --stringname="solveContactCL" stringify
-premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/solveFriction.cl" --headerfile="../opencl/gpu_rigidbody/kernels/solveFriction.h" --stringname="solveFrictionCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h" --stringname="satKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify


+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup.h" --stringname="solverSetupCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverSetup2.h" --stringname="solverSetup2CL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernels.h" --stringname="batchingKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/batchingKernelsNew.h" --stringname="batchingKernelsNewCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solverUtils.h" --stringname="solverUtilsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveContact.h" --stringname="solveContactCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.cl" --headerfile="../src/Bullet3OpenCL/RigidBody//kernels/solveFriction.h" --stringname="solveFrictionCL" stringify


 pause
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
@ -30,12 +30,12 @@ static const char* sapFastCL= \
 "		float   m_maxElems[4];\n"
 "		int			m_maxIndices[4];\n"
 "	};\n"
-"} b3AabbCL;\n"
+"} btAabbCL;\n"
 "\n"
 "\n"
 "/// conservative test for overlap between two aabbs\n"
-"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
 "{\n"
 "//skip pairs between static (mass=0) objects\n"
 "	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
@ -50,18 +50,18 @@ static const char* sapFastCL= \
 "\n"
 "\n"
 "//computePairsKernelBatchWrite\n"
-"__kernel void   computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	int localId = get_local_id(0);\n"
 "\n"
 "	__local int numActiveWgItems[1];\n"
 "	__local int breakRequest[1];\n"
-"	__local b3AabbCL localAabbs[128];// = aabbs[i];\n"
+"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
 "	\n"
 "	int2 myPairs[64];\n"
 "	\n"
-"	b3AabbCL myAabb;\n"
+"	btAabbCL myAabb;\n"
 "	\n"
 "	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
 "	float testValue = 	myAabb.m_maxElems[axis];\n"
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@ -30,12 +30,12 @@ static const char* sapCL= \
 "		float   m_maxElems[4];\n"
 "		int			m_maxIndices[4];\n"
 "	};\n"
-"} b3AabbCL;\n"
+"} btAabbCL;\n"
 "\n"
 "\n"
 "/// conservative test for overlap between two aabbs\n"
-"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
 "{\n"
 "	bool overlap = true;\n"
 "	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
@ -43,8 +43,8 @@ static const char* sapCL= \
 "	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
 "	return overlap;\n"
 "}\n"
-"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
 "{\n"
 "	bool overlap = true;\n"
 "	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
@ -53,8 +53,8 @@ static const char* sapCL= \
 "	return overlap;\n"
 "}\n"
 "\n"
-"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
+"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
 "{\n"
 "	bool overlap = true;\n"
 "	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
@ -64,7 +64,7 @@ static const char* sapCL= \
 "}\n"
 "\n"
 "\n"
-"__kernel void   computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
+"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numUnsortedAabbs)\n"
@ -89,7 +89,7 @@ static const char* sapCL= \
 "	}\n"
 "}\n"
 "\n"
-"__kernel void   computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numObjects)\n"
@ -117,7 +117,7 @@ static const char* sapCL= \
 "\n"
 "\n"
 "\n"
-"__kernel void   computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	int localId = get_local_id(0);\n"
@ -181,16 +181,16 @@ static const char* sapCL= \
 "}\n"
 "\n"
 "\n"
-"__kernel void   computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	int localId = get_local_id(0);\n"
 "\n"
 "	__local int numActiveWgItems[1];\n"
 "	__local int breakRequest[1];\n"
-"	__local b3AabbCL localAabbs[128];// = aabbs[i];\n"
+"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
 "	\n"
-"	b3AabbCL myAabb;\n"
+"	btAabbCL myAabb;\n"
 "	\n"
 "	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
 "	float testValue = 	myAabb.m_maxElems[axis];\n"
@ -289,7 +289,7 @@ static const char* sapCL= \
 "\n"
 "\n"
 "\n"
-"__kernel void   copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
+"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numObjects)\n"
@ -300,7 +300,7 @@ static const char* sapCL= \
 "}\n"
 "\n"
 "\n"
-"__kernel void   flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
+"__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numObjects)\n"
@ -312,7 +312,7 @@ static const char* sapCL= \
 "}\n"
 "\n"
 "\n"
-"__kernel void   scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
+"__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numObjects)\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
@ -1,6 +1,6 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* bvhTraversalKernelCL= \
-"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
 "//written by Erwin Coumans\n"
 "\n"
 "#define SHAPE_CONVEX_HULL 3\n"
@ -13,7 +13,7 @@ static const char* bvhTraversalKernelCL= \
 "\n"
 "#define MAX_NUM_PARTS_IN_BITS 10\n"
 "\n"
-"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
+"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
 "///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
 "typedef struct\n"
 "{\n"
@ -22,7 +22,7 @@ static const char* bvhTraversalKernelCL= \
 "	unsigned short int	m_quantizedAabbMax[3];\n"
 "	//4 bytes\n"
 "	int	m_escapeIndexOrTriangleIndex;\n"
-"} b3QuantizedBvhNode;\n"
+"} btQuantizedBvhNode;\n"
 "\n"
 "typedef struct\n"
 "{\n"
@ -44,12 +44,12 @@ static const char* bvhTraversalKernelCL= \
 "	}\n"
 "	int getEscapeIndex() const\n"
 "	{\n"
-"		b3Assert(!isLeafNode());\n"
+"		btAssert(!isLeafNode());\n"
 "		return -m_escapeIndexOrTriangleIndex;\n"
 "	}\n"
 "	int	getTriangleIndex() const\n"
 "	{\n"
-"		b3Assert(isLeafNode());\n"
+"		btAssert(isLeafNode());\n"
 "		unsigned int x=0;\n"
 "		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
 "		// Get only the lower bits where the triangle index is stored\n"
@ -57,13 +57,13 @@ static const char* bvhTraversalKernelCL= \
 "	}\n"
 "	int	getPartId() const\n"
 "	{\n"
-"		b3Assert(isLeafNode());\n"
+"		btAssert(isLeafNode());\n"
 "		// Get only the highest bits where the part index is stored\n"
 "		return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));\n"
 "	}\n"
 "*/\n"
 "\n"
-"int	getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n"
+"int	getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
 "{\n"
 "	unsigned int x=0;\n"
 "	unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
@ -71,13 +71,13 @@ static const char* bvhTraversalKernelCL= \
 "	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
 "}\n"
 "\n"
-"int isLeaf(const b3QuantizedBvhNode* rootNode)\n"
+"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
 "{\n"
 "	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
 "	return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
 "}\n"
 "	\n"
-"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n"
+"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
 "{\n"
 "	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
 "}\n"
@ -92,9 +92,9 @@ static const char* bvhTraversalKernelCL= \
 "	//4 bytes\n"
 "	int			m_subtreeSize;\n"
 "	int			m_padding[3];\n"
-"} b3BvhSubtreeInfo;\n"
+"} btBvhSubtreeInfo;\n"
 "\n"
-"///keep this in sync with b3Collidable.h\n"
+"///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
 "	int m_numChildShapes;\n"
@ -102,7 +102,7 @@ static const char* bvhTraversalKernelCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "	\n"
-"} b3CollidableGpu;\n"
+"} btCollidableGpu;\n"
 "\n"
 "typedef struct\n"
 "{\n"
@ -112,7 +112,7 @@ static const char* bvhTraversalKernelCL= \
 "	int m_unused0;\n"
 "	int m_unused1;\n"
 "	int m_unused2;\n"
-"} b3GpuChildShape;\n"
+"} btGpuChildShape;\n"
 "\n"
 "\n"
 "typedef struct\n"
@ -142,7 +142,7 @@ static const char* bvhTraversalKernelCL= \
 "		float   m_maxElems[4];\n"
 "		int			m_maxIndices[4];\n"
 "	};\n"
-"} b3AabbCL;\n"
+"} btAabbCL;\n"
 "\n"
 "\n"
 "int testQuantizedAabbAgainstQuantizedAabb(\n"
@ -196,12 +196,12 @@ static const char* bvhTraversalKernelCL= \
 "// work-in-progress\n"
 "__kernel void   bvhTraversalKernel( __global const int2* pairs, \n"
 "									__global const BodyData* rigidBodies, \n"
-"									__global const b3CollidableGpu* collidables,\n"
-"									__global b3AabbCL* aabbs,\n"
+"									__global const btCollidableGpu* collidables,\n"
+"									__global btAabbCL* aabbs,\n"
 "									__global int4* concavePairsOut,\n"
 "									__global volatile int* numConcavePairsOut,\n"
-"									__global const b3BvhSubtreeInfo* subtreeHeadersRoot,\n"
-"									__global const b3QuantizedBvhNode* quantizedNodesRoot,\n"
+"									__global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
+"									__global const btQuantizedBvhNode* quantizedNodesRoot,\n"
 "									__global const b3BvhInfo* bvhInfos,\n"
 "									int numPairs,\n"
 "									int maxNumConcavePairsCapacity)\n"
@ -238,8 +238,8 @@ static const char* bvhTraversalKernelCL= \
 "	float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
 "	float4 bvhQuantization = bvhInfo.m_quantization;\n"
 "	int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
-"	__global const b3BvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
-"	__global const b3QuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
+"	__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
+"	__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
 "	\n"
 "\n"
 "	unsigned short int quantizedQueryAabbMin[3];\n"
@ -249,7 +249,7 @@ static const char* bvhTraversalKernelCL= \
 "	\n"
 "	for (int i=0;i<numSubtreeHeaders;i++)\n"
 "	{\n"
-"		b3BvhSubtreeInfo subtree = subtreeHeaders[i];\n"
+"		btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
 "				\n"
 "		int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
 "		if (overlap != 0)\n"
@ -262,7 +262,7 @@ static const char* bvhTraversalKernelCL= \
 "			int aabbOverlap;\n"
 "			while (curIndex < endNodeIndex)\n"
 "			{\n"
-"				b3QuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
+"				btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
 "				aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
 "				isLeafNode = isLeaf(&rootNode);\n"
 "				if (aabbOverlap)\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
@ -67,9 +67,9 @@ static const char* primitiveContactsKernelsCL= \
 "		float   m_maxElems[4];\n"
 "		int			m_maxIndices[4];\n"
 "	};\n"
-"} b3AabbCL;\n"
+"} btAabbCL;\n"
 "\n"
-"///keep this in sync with b3Collidable.h\n"
+"///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
 "	int m_numChildShapes;\n"
@ -77,7 +77,7 @@ static const char* primitiveContactsKernelsCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "	\n"
-"} b3CollidableGpu;\n"
+"} btCollidableGpu;\n"
 "\n"
 "typedef struct\n"
 "{\n"
@ -87,7 +87,7 @@ static const char* primitiveContactsKernelsCL= \
 "	int m_unused0;\n"
 "	int m_unused1;\n"
 "	int m_unused2;\n"
-"} b3GpuChildShape;\n"
+"} btGpuChildShape;\n"
 "\n"
 "#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
 "\n"
@ -129,7 +129,7 @@ static const char* primitiveContactsKernelsCL= \
 "	float4 m_plane;\n"
 "	int m_indexOffset;\n"
 "	int m_numIndices;\n"
-"} b3GpuFace;\n"
+"} btGpuFace;\n"
 "\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
 "\n"
@ -290,7 +290,7 @@ static const char* primitiveContactsKernelsCL= \
 "\n"
 "\n"
 "inline bool IsPointInPolygon(float4 p, \n"
-"							const b3GpuFace* face,\n"
+"							const btGpuFace* face,\n"
 "							__global const float4* baseVertex,\n"
 "							__global const  int* convexIndices,\n"
 "							float4* out)\n"
@ -352,11 +352,11 @@ static const char* primitiveContactsKernelsCL= \
 "																int bodyIndexA, int bodyIndexB, \n"
 "																int collidableIndexA, int collidableIndexB, \n"
 "																__global const BodyData* rigidBodies, \n"
-"																__global const b3CollidableGpu* collidables,\n"
+"																__global const btCollidableGpu* collidables,\n"
 "																__global const ConvexPolyhedronCL* convexShapes,\n"
 "																__global const float4* convexVertices,\n"
 "																__global const int* convexIndices,\n"
-"																__global const b3GpuFace* faces,\n"
+"																__global const btGpuFace* faces,\n"
 "																__global Contact4* restrict globalContactsOut,\n"
 "																counter32_t nGlobalContactsOut,\n"
 "																int maxContactCapacity,\n"
@ -383,7 +383,7 @@ static const char* primitiveContactsKernelsCL= \
 "\n"
 "	for ( int f = 0; f < numFaces; f++ )\n"
 "	{\n"
-"		b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
+"		btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
 "\n"
 "		// set up a plane equation \n"
 "		float4 planeEqn;\n"
@ -594,11 +594,11 @@ static const char* primitiveContactsKernelsCL= \
 "								int bodyIndexA, int bodyIndexB, \n"
 "								int collidableIndexA, int collidableIndexB, \n"
 "								__global const BodyData* rigidBodies, \n"
-"								__global const b3CollidableGpu*collidables,\n"
+"								__global const btCollidableGpu*collidables,\n"
 "								__global const ConvexPolyhedronCL* convexShapes,\n"
 "								__global const float4* convexVertices,\n"
 "								__global const int* convexIndices,\n"
-"								__global const b3GpuFace* faces,\n"
+"								__global const btGpuFace* faces,\n"
 "								__global Contact4* restrict globalContactsOut,\n"
 "								counter32_t nGlobalContactsOut,\n"
 "								int maxContactCapacity,\n"
@ -733,8 +733,8 @@ static const char* primitiveContactsKernelsCL= \
 "																int bodyIndexA, int bodyIndexB, \n"
 "																int collidableIndexA, int collidableIndexB, \n"
 "																__global const BodyData* rigidBodies, \n"
-"																__global const b3CollidableGpu* collidables,\n"
-"																__global const b3GpuFace* faces,\n"
+"																__global const btCollidableGpu* collidables,\n"
+"																__global const btGpuFace* faces,\n"
 "																__global Contact4* restrict globalContactsOut,\n"
 "																counter32_t nGlobalContactsOut,\n"
 "																int maxContactCapacity)\n"
@ -793,11 +793,11 @@ static const char* primitiveContactsKernelsCL= \
 "\n"
 "__kernel void   primitiveContactsKernel( __global const int2* pairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
 "																					__global Contact4* restrict globalContactsOut,\n"
 "																					counter32_t nGlobalContactsOut,\n"
@ -972,14 +972,14 @@ static const char* primitiveContactsKernelsCL= \
 "// work-in-progress\n"
 "__kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n"
 "													__global const BodyData* rigidBodies, \n"
-"													__global const b3CollidableGpu* collidables,\n"
+"													__global const btCollidableGpu* collidables,\n"
 "													__global const ConvexPolyhedronCL* convexShapes, \n"
 "													__global const float4* vertices,\n"
 "													__global const float4* uniqueEdges,\n"
-"													__global const b3GpuFace* faces,\n"
+"													__global const btGpuFace* faces,\n"
 "													__global const int* indices,\n"
-"													__global b3AabbCL* aabbs,\n"
-"													__global const b3GpuChildShape* gpuChildShapes,\n"
+"													__global btAabbCL* aabbs,\n"
+"													__global const btGpuChildShape* gpuChildShapes,\n"
 "													__global Contact4* restrict globalContactsOut,\n"
 "													counter32_t nGlobalContactsOut,\n"
 "													int numCompoundPairs, int maxContactCapacity\n"
@ -1157,7 +1157,7 @@ static const char* primitiveContactsKernelsCL= \
 "									int bodyIndexA, int bodyIndexB,\n"
 "									int collidableIndexA, int collidableIndexB, \n"
 "									__global const BodyData* rigidBodies, \n"
-"									__global const b3CollidableGpu* collidables,\n"
+"									__global const btCollidableGpu* collidables,\n"
 "									const float4* triangleVertices,\n"
 "									__global Contact4* restrict globalContactsOut,\n"
 "									counter32_t nGlobalContactsOut,\n"
@ -1299,13 +1299,13 @@ static const char* primitiveContactsKernelsCL= \
 "// work-in-progress\n"
 "__kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs,\n"
 "												__global const BodyData* rigidBodies,\n"
-"												__global const b3CollidableGpu* collidables,\n"
+"												__global const btCollidableGpu* collidables,\n"
 "												__global const ConvexPolyhedronCL* convexShapes, \n"
 "												__global const float4* vertices,\n"
 "												__global const float4* uniqueEdges,\n"
-"												__global const b3GpuFace* faces,\n"
+"												__global const btGpuFace* faces,\n"
 "												__global const int* indices,\n"
-"												__global b3AabbCL* aabbs,\n"
+"												__global btAabbCL* aabbs,\n"
 "												__global Contact4* restrict globalContactsOut,\n"
 "												counter32_t nGlobalContactsOut,\n"
 "													int numConcavePairs, int maxContactCapacity\n"
@ -1329,7 +1329,7 @@ static const char* primitiveContactsKernelsCL= \
 "	if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n"
 "	{\n"
 "		int f = concavePairs[i].z;\n"
-"		b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
 "		\n"
 "		float4 verticesA[3];\n"
 "		for (int i=0;i<3;i++)\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
@ -55,7 +55,7 @@ static const char* satClipKernelsCL= \
 "} Contact4;\n"
 "\n"
 "\n"
-"///keep this in sync with b3Collidable.h\n"
+"///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
 "	int m_numChildShapes;\n"
@ -63,7 +63,7 @@ static const char* satClipKernelsCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "	\n"
-"} b3CollidableGpu;\n"
+"} btCollidableGpu;\n"
 "\n"
 "typedef struct\n"
 "{\n"
@ -73,7 +73,7 @@ static const char* satClipKernelsCL= \
 "	int m_unused0;\n"
 "	int m_unused1;\n"
 "	int m_unused2;\n"
-"} b3GpuChildShape;\n"
+"} btGpuChildShape;\n"
 "\n"
 "#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
 "\n"
@ -115,7 +115,7 @@ static const char* satClipKernelsCL= \
 "	float4 m_plane;\n"
 "	int m_indexOffset;\n"
 "	int m_numIndices;\n"
-"} b3GpuFace;\n"
+"} btGpuFace;\n"
 "\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
 "\n"
@ -357,7 +357,7 @@ static const char* satClipKernelsCL= \
 "	float4* worldVertsB2, int capacityWorldVertsB2,\n"
 "	const float minDist, float maxDist,\n"
 "	__global const float4* vertices,\n"
-"	__global const b3GpuFace* faces,\n"
+"	__global const btGpuFace* faces,\n"
 "	__global const int* indices,\n"
 "	float4* contactsOut,\n"
 "	int contactCapacity)\n"
@ -392,7 +392,7 @@ static const char* satClipKernelsCL= \
 "	if (closestFaceA<0)\n"
 "		return numContactsOut;\n"
 "\n"
-"	b3GpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
+"	btGpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
 "\n"
 "	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
 "	int numVerticesA = polyA.m_numIndices;\n"
@ -416,7 +416,7 @@ static const char* satClipKernelsCL= \
 "		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
 "		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
 "\n"
-"		//b3Swap(pVtxIn,pVtxOut);\n"
+"		//btSwap(pVtxIn,pVtxOut);\n"
 "		float4* tmp = pVtxOut;\n"
 "		pVtxOut = pVtxIn;\n"
 "		pVtxIn = tmp;\n"
@ -458,10 +458,10 @@ static const char* satClipKernelsCL= \
 "	float4* worldVertsB2, int capacityWorldVertsB2,\n"
 "	const float minDist, float maxDist,\n"
 "	const float4* verticesA,\n"
-"	const b3GpuFace* facesA,\n"
+"	const btGpuFace* facesA,\n"
 "	const int* indicesA,\n"
 "	__global const float4* verticesB,\n"
-"	__global const b3GpuFace* facesB,\n"
+"	__global const btGpuFace* facesB,\n"
 "	__global const int* indicesB,\n"
 "	float4* contactsOut,\n"
 "	int contactCapacity)\n"
@ -496,7 +496,7 @@ static const char* satClipKernelsCL= \
 "	if (closestFaceA<0)\n"
 "		return numContactsOut;\n"
 "\n"
-"	b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
+"	btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
 "\n"
 "	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
 "	int numVerticesA = polyA.m_numIndices;\n"
@ -520,7 +520,7 @@ static const char* satClipKernelsCL= \
 "		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
 "		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
 "\n"
-"		//b3Swap(pVtxIn,pVtxOut);\n"
+"		//btSwap(pVtxIn,pVtxOut);\n"
 "		float4* tmp = pVtxOut;\n"
 "		pVtxOut = pVtxIn;\n"
 "		pVtxIn = tmp;\n"
@ -561,7 +561,7 @@ static const char* satClipKernelsCL= \
 "	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
 "	const float minDist, float maxDist,\n"
 "	__global const float4* vertices,\n"
-"	__global const b3GpuFace* faces,\n"
+"	__global const btGpuFace* faces,\n"
 "	__global const int* indices,\n"
 "	float4*	localContactsOut,\n"
 "	int localContactCapacity)\n"
@ -589,7 +589,7 @@ static const char* satClipKernelsCL= \
 "	}\n"
 "\n"
 "	{\n"
-"		const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
+"		const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
 "		const int numVertices = polyB.m_numIndices;\n"
 "		for(int e0=0;e0<numVertices;e0++)\n"
 "		{\n"
@ -617,10 +617,10 @@ static const char* satClipKernelsCL= \
 "	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
 "	const float minDist, float maxDist,\n"
 "	const float4* verticesA,\n"
-"	const b3GpuFace* facesA,\n"
+"	const btGpuFace* facesA,\n"
 "	const int* indicesA,\n"
 "	__global const float4* verticesB,\n"
-"	__global const b3GpuFace* facesB,\n"
+"	__global const btGpuFace* facesB,\n"
 "	__global const int* indicesB,\n"
 "	float4*	localContactsOut,\n"
 "	int localContactCapacity)\n"
@ -648,7 +648,7 @@ static const char* satClipKernelsCL= \
 "	}\n"
 "\n"
 "	{\n"
-"		const b3GpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
 "		const int numVertices = polyB.m_numIndices;\n"
 "		for(int e0=0;e0<numVertices;e0++)\n"
 "		{\n"
@ -956,11 +956,11 @@ static const char* satClipKernelsCL= \
 "\n"
 "__kernel void   clipHullHullKernel( __global const int2* pairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
 "																					__global const float4* separatingNormals,\n"
 "																					__global const int* hasSeparatingAxis,\n"
@ -1053,13 +1053,13 @@ static const char* satClipKernelsCL= \
 "\n"
 "__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
-"																					__global const b3GpuChildShape* gpuChildShapes,\n"
+"																					__global const btGpuChildShape* gpuChildShapes,\n"
 "																					__global const float4* gpuCompoundSepNormalsOut,\n"
 "																					__global const int* gpuHasCompoundSepNormalsOut,\n"
 "																					__global Contact4* restrict globalContactsOut,\n"
@ -1185,7 +1185,7 @@ static const char* satClipKernelsCL= \
 "\n"
 "__kernel void   sphereSphereCollisionKernel( __global const int2* pairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const float4* separatingNormals,\n"
 "																					__global const int* hasSeparatingAxis,\n"
 "																					__global Contact4* restrict globalContactsOut,\n"
@ -1252,13 +1252,13 @@ static const char* satClipKernelsCL= \
 "\n"
 "__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
-"																					__global const b3GpuChildShape* gpuChildShapes,\n"
+"																					__global const btGpuChildShape* gpuChildShapes,\n"
 "																					__global const float4* separatingNormals,\n"
 "																					__global Contact4* restrict globalContactsOut,\n"
 "																					counter32_t nGlobalContactsOut,\n"
@ -1306,7 +1306,7 @@ static const char* satClipKernelsCL= \
 "		convexPolyhedronA.m_vertexOffset = 0;\n"
 "		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
 "\n"
-"		b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
 "		\n"
 "		float4 verticesA[3];\n"
 "		for (int i=0;i<3;i++)\n"
@ -1335,7 +1335,7 @@ static const char* satClipKernelsCL= \
 "                                  \n"
 "		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
 "                             \n"
-"		b3GpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
 "		int indicesA[3+3+2+2+2];\n"
 "		int curUsedIndices=0;\n"
 "		int fidx=0;\n"
@ -1496,7 +1496,7 @@ static const char* satClipKernelsCL= \
 "                      int capacityWorldVerts,\n"
 "                      const float minDist, float maxDist,\n"
 "                      __global const float4* vertices,\n"
-"                      __global const b3GpuFace* faces,\n"
+"                      __global const btGpuFace* faces,\n"
 "                      __global const int* indices,\n"
 "                      __global int4* clippingFaces, int pairIndex)\n"
 "{\n"
@ -1523,7 +1523,7 @@ static const char* satClipKernelsCL= \
 "	}\n"
 "    \n"
 "	{\n"
-"		const b3GpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
+"		const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
 "		const int numVertices = polyB.m_numIndices;\n"
 "		for(int e0=0;e0<numVertices;e0++)\n"
 "		{\n"
@ -1664,11 +1664,11 @@ static const char* satClipKernelsCL= \
 "\n"
 "__kernel void   findClippingFacesKernel(  __global const int2* pairs,\n"
 "                                        __global const BodyData* rigidBodies,\n"
-"                                        __global const b3CollidableGpu* collidables,\n"
+"                                        __global const btCollidableGpu* collidables,\n"
 "                                        __global const ConvexPolyhedronCL* convexShapes,\n"
 "                                        __global const float4* vertices,\n"
 "                                        __global const float4* uniqueEdges,\n"
-"                                        __global const b3GpuFace* faces,\n"
+"                                        __global const btGpuFace* faces,\n"
 "                                        __global const int* indices,\n"
 "                                        __global const float4* separatingNormals,\n"
 "                                        __global const int* hasSeparatingAxis,\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
@ -1,6 +1,6 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* satKernelsCL= \
-"//keep this enum in sync with the CPU version (in b3Collidable.h)\n"
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
 "//written by Erwin Coumans\n"
 "\n"
 "\n"
@ -13,7 +13,7 @@ static const char* satKernelsCL= \
 "\n"
 "typedef unsigned int u32;\n"
 "\n"
-"///keep this in sync with b3Collidable.h\n"
+"///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
 "	int m_numChildShapes;\n"
@ -21,7 +21,7 @@ static const char* satKernelsCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "	\n"
-"} b3CollidableGpu;\n"
+"} btCollidableGpu;\n"
 "\n"
 "typedef struct\n"
 "{\n"
@ -31,7 +31,7 @@ static const char* satKernelsCL= \
 "	int m_unused0;\n"
 "	int m_unused1;\n"
 "	int m_unused2;\n"
-"} b3GpuChildShape;\n"
+"} btGpuChildShape;\n"
 "\n"
 "\n"
 "typedef struct\n"
@ -80,14 +80,14 @@ static const char* satKernelsCL= \
 "		float   m_maxElems[4];\n"
 "		int			m_maxIndices[4];\n"
 "	};\n"
-"} b3AabbCL;\n"
+"} btAabbCL;\n"
 "\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_plane;\n"
 "	int m_indexOffset;\n"
 "	int m_numIndices;\n"
-"} b3GpuFace;\n"
+"} btGpuFace;\n"
 "\n"
 "#define make_float4 (float4)\n"
 "\n"
@ -296,12 +296,12 @@ static const char* satKernelsCL= \
 "	\n"
 "	const float4* verticesA, \n"
 "	const float4* uniqueEdgesA, \n"
-"	const b3GpuFace* facesA,\n"
+"	const btGpuFace* facesA,\n"
 "	const int*  indicesA,\n"
 "\n"
 "	__global const float4* verticesB, \n"
 "	__global const float4* uniqueEdgesB, \n"
-"	__global const b3GpuFace* facesB,\n"
+"	__global const btGpuFace* facesB,\n"
 "	__global const int*  indicesB,\n"
 "	float4* sep,\n"
 "	float* dmin)\n"
@ -348,11 +348,11 @@ static const char* satKernelsCL= \
 "	const float4 DeltaC2,\n"
 "	__global const float4* verticesA, \n"
 "	__global const float4* uniqueEdgesA, \n"
-"	__global const b3GpuFace* facesA,\n"
+"	__global const btGpuFace* facesA,\n"
 "	__global const int*  indicesA,\n"
 "	const float4* verticesB,\n"
 "	const float4* uniqueEdgesB, \n"
-"	const b3GpuFace* facesB,\n"
+"	const btGpuFace* facesB,\n"
 "	const int*  indicesB,\n"
 "	float4* sep,\n"
 "	float* dmin)\n"
@ -401,11 +401,11 @@ static const char* satKernelsCL= \
 "	const float4 DeltaC2,\n"
 "	const float4* verticesA, \n"
 "	const float4* uniqueEdgesA, \n"
-"	const b3GpuFace* facesA,\n"
+"	const btGpuFace* facesA,\n"
 "	const int*  indicesA,\n"
 "	__global const float4* verticesB, \n"
 "	__global const float4* uniqueEdgesB, \n"
-"	__global const b3GpuFace* facesB,\n"
+"	__global const btGpuFace* facesB,\n"
 "	__global const int*  indicesB,\n"
 "		float4* sep,\n"
 "	float* dmin)\n"
@ -507,7 +507,7 @@ static const char* satKernelsCL= \
 "	const float4 DeltaC2,\n"
 "	__global const float4* vertices, \n"
 "	__global const float4* uniqueEdges, \n"
-"	__global const b3GpuFace* faces,\n"
+"	__global const btGpuFace* faces,\n"
 "	__global const int*  indices,\n"
 "	float4* sep,\n"
 "	float* dmin)\n"
@ -566,7 +566,7 @@ static const char* satKernelsCL= \
 "	const float4 DeltaC2,\n"
 "	__global const float4* vertices, \n"
 "	__global const float4* uniqueEdges, \n"
-"	__global const b3GpuFace* faces,\n"
+"	__global const btGpuFace* faces,\n"
 "	__global const int*  indices,\n"
 "	float4* sep,\n"
 "	float* dmin)\n"
@ -643,14 +643,14 @@ static const char* satKernelsCL= \
 "// work-in-progress\n"
 "__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
-"																					__global b3AabbCL* aabbs,\n"
-"																					__global const b3GpuChildShape* gpuChildShapes,\n"
+"																					__global btAabbCL* aabbs,\n"
+"																					__global const btGpuChildShape* gpuChildShapes,\n"
 "																					__global volatile float4* gpuCompoundSepNormalsOut,\n"
 "																					__global volatile int* gpuHasCompoundSepNormalsOut,\n"
 "																					int numCompoundPairs\n"
@ -760,14 +760,14 @@ static const char* satKernelsCL= \
 "// work-in-progress\n"
 "__kernel void   findCompoundPairsKernel( __global const int2* pairs, \n"
 "	__global const BodyData* rigidBodies, \n"
-"	__global const b3CollidableGpu* collidables,\n"
+"	__global const btCollidableGpu* collidables,\n"
 "	__global const ConvexPolyhedronCL* convexShapes, \n"
 "	__global const float4* vertices,\n"
 "	__global const float4* uniqueEdges,\n"
-"	__global const b3GpuFace* faces,\n"
+"	__global const btGpuFace* faces,\n"
 "	__global const int* indices,\n"
-"	__global b3AabbCL* aabbs,\n"
-"	__global const b3GpuChildShape* gpuChildShapes,\n"
+"	__global btAabbCL* aabbs,\n"
+"	__global const btGpuChildShape* gpuChildShapes,\n"
 "	__global volatile int4* gpuCompoundPairsOut,\n"
 "	__global volatile int* numCompoundPairsOut,\n"
 "	int numPairs,\n"
@ -942,13 +942,13 @@ static const char* satKernelsCL= \
 "// work-in-progress\n"
 "__kernel void   findSeparatingAxisKernel( __global const int2* pairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
-"																					__global b3AabbCL* aabbs,\n"
+"																					__global btAabbCL* aabbs,\n"
 "																					__global volatile float4* separatingNormals,\n"
 "																					__global volatile int* hasSeparatingAxis,\n"
 "																					int numPairs\n"
@ -1056,14 +1056,14 @@ static const char* satKernelsCL= \
 "// work-in-progress\n"
 "__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n"
 "																					__global const BodyData* rigidBodies,\n"
-"																					__global const b3CollidableGpu* collidables,\n"
+"																					__global const btCollidableGpu* collidables,\n"
 "																					__global const ConvexPolyhedronCL* convexShapes, \n"
 "																					__global const float4* vertices,\n"
 "																					__global const float4* uniqueEdges,\n"
-"																					__global const b3GpuFace* faces,\n"
+"																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
-"																					__global const b3GpuChildShape* gpuChildShapes,\n"
-"																					__global b3AabbCL* aabbs,\n"
+"																					__global const btGpuChildShape* gpuChildShapes,\n"
+"																					__global btAabbCL* aabbs,\n"
 "																					__global float4* concaveSeparatingNormalsOut,\n"
 "																					int numConcavePairs\n"
 "																					)\n"
@ -1106,9 +1106,9 @@ static const char* satKernelsCL= \
 "	convexPolyhedronA.m_vertexOffset = 0;\n"
 "	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
 "\n"
-"	b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
 "	float4 triMinAabb, triMaxAabb;\n"
-"	b3AabbCL triAabb;\n"
+"	btAabbCL triAabb;\n"
 "	triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
 "	triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
 "	\n"
@ -1153,7 +1153,7 @@ static const char* satKernelsCL= \
 "                                  \n"
 "		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
 "                             \n"
-"		b3GpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
 "		int indicesA[3+3+2+2+2];\n"
 "		int curUsedIndices=0;\n"
 "		int fidx=0;\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
@ -210,7 +210,7 @@ static const char* batchingKernelsNewCL= \
 "\n"
 "					if (i!=numValidConstraints)\n"
 "					{\n"
-"						//b3Swap(cs[i],cs[numValidConstraints]);\n"
+"						//btSwap(cs[i],cs[numValidConstraints]);\n"
 "						\n"
 "						Contact4 tmp = cs[i];\n"
 "						cs[i] = cs[numValidConstraints];\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
@ -47,7 +47,7 @@ static const char* integrateKernelCL= \
 "  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
 "{\n"
 "	int nodeID = get_global_id(0);\n"
-"	float B3_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
+"	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
 "	if( nodeID < numNodes && (bodies[nodeID].m_invMass != 0.f))\n"
 "	{\n"
 "		//angular velocity\n"
@ -61,9 +61,9 @@ static const char* integrateKernelCL= \
 "			float4 angvel = bodies[nodeID].m_angVel;\n"
 "			float fAngle = native_sqrt(dot(angvel, angvel));\n"
 "			//limit the angular motion\n"
-"			if(fAngle*timeStep > B3_GPU_ANGULAR_MOTION_THRESHOLD)\n"
+"			if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
 "			{\n"
-"				fAngle = B3_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
+"				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
 "			}\n"
 "			if(fAngle < 0.001f)\n"
 "			{\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h
@ -313,8 +313,8 @@ static const char* solveContactCL= \
 "	}\n"
 "}\n"
 "\n"
-"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
-" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
+"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
 "{\n"
 "  if (fabs(n[0].z) > 0.70710678f) {\n"
 "    // choose p in y-z plane\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
@ -265,8 +265,8 @@ static const char* solveFrictionCL= \
 "	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
 "	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
 "}\n"
-"void b3PlaneSpace1 (const float4* n, float4* p, float4* q);\n"
-" void b3PlaneSpace1 (const float4* n, float4* p, float4* q)\n"
+"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
 "{\n"
 "  if (fabs(n[0].z) > 0.70710678f) {\n"
 "    // choose p in y-z plane\n"
@ -347,7 +347,7 @@ static const char* solveFrictionCL= \
 "			float4 n = -cs->m_linear;\n"
 "			\n"
 "			float4 tangent[2];\n"
-"			b3PlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
+"			btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
 "			float4 angular0, angular1, linear;\n"
 "			float4 r0 = center - posA;\n"
 "			float4 r1 = center - posB;\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
@ -489,8 +489,8 @@ static const char* solverSetupCL= \
 "} ConstBufferSSD;\n"
 "\n"
 "\n"
-"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
-" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
+"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
 "{\n"
 "  if (fabs(n.z) > 0.70710678f) {\n"
 "    // choose p in y-z plane\n"
@ -577,7 +577,7 @@ static const char* solverSetupCL= \
 "		center /= (float)src->m_worldNormal.w;\n"
 "\n"
 "		float4 tangent[2];\n"
-"		b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
+"		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
 "		\n"
 "		float4 r[2];\n"
 "		r[0] = center - posA;\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
@ -488,8 +488,8 @@ static const char* solverUtilsCL= \
 "}\n"
 "\n"
 "\n"
-"void b3PlaneSpace1 (float4 n, float4* p, float4* q);\n"
-" void b3PlaneSpace1 (float4 n, float4* p, float4* q)\n"
+"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
+" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
 "{\n"
 "  if (fabs(n.z) > 0.70710678f) {\n"
 "    // choose p in y-z plane\n"
@ -739,7 +739,7 @@ static const char* solverUtilsCL= \
 "			float4 n = -cs->m_linear;\n"
 "			\n"
 "			float4 tangent[2];\n"
-"			b3PlaneSpace1(n,&tangent[0],&tangent[1]);\n"
+"			btPlaneSpace1(n,&tangent[0],&tangent[1]);\n"
 "			float4 angular0, angular1, linear;\n"
 "			float4 r0 = center - posA;\n"
 "			float4 r1 = center - posB;\n"
@ -896,7 +896,7 @@ static const char* solverUtilsCL= \
 "		center /= (float)src->m_worldNormal.w;\n"
 "\n"
 "		float4 tangent[2];\n"
-"		b3PlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
+"		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
 "		\n"
 "		float4 r[2];\n"
 "		r[0] = center - posA;\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
@ -120,7 +120,7 @@ static const char* updateAabbsKernelCL= \
 "	float			fy;\n"
 "	float			fz;\n"
 "	int	uw;\n"
-"} b3AABBCL;\n"
+"} btAABBCL;\n"
 "\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
@ -156,7 +156,7 @@ static const char* updateAabbsKernelCL= \
 "}\n"
 "\n"
 "\n"
-"__kernel void initializeGpuAabbsFull(  const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)\n"
+"__kernel void initializeGpuAabbsFull(  const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)\n"
 "{\n"
 "	int nodeID = get_global_id(0);\n"
 "		\n"
@ -171,8 +171,8 @@ static const char* updateAabbsKernelCL= \
 "			\n"
 "		if (shapeIndex>=0)\n"
 "		{\n"
-"			b3AABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
-"			b3AABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
+"			btAABBCL minAabb = plocalShapeAABB[collidableIndex*2];\n"
+"			btAABBCL maxAabb = plocalShapeAABB[collidableIndex*2+1];\n"
 "				\n"
 "			float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;\n"
 "			float4 localCenter = ((float4)(maxAabb.fx + minAabb.fx,maxAabb.fy + minAabb.fy,maxAabb.fz + minAabb.fz,0.f))*0.5f;\n"
--- a/test/OpenCL/BitonicSort/BitonicSort.cl
+++ b/test/OpenCL/BitonicSort/BitonicSort.cl
@ -0,0 +1,171 @@
+MSTRINGIFY(
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
+{
+    if((keyA[0].x > keyB[0].x) == dir)
+    {
+		int2 tmp = *keyA;
+		*keyA = *keyB;
+		*keyB = tmp;
+    }
+}
+
+inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
+{
+    if((keyA[0].x > keyB[0].x) == dir)
+    {
+		int2 tmp = *keyA;
+		*keyA = *keyB;
+		*keyB = tmp;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
+{
+    __local int2 l_key[1024U];
+    int localSizeLimit = get_local_size(0) * 2;
+
+    //Offset to the beginning of subbatch and load data
+    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
+    l_key[get_local_id(0) +                    0] = pKey[                   0];
+    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
+
+    for(uint size = 2; size < arrayLength; size <<= 1)
+    {
+        //Bitonic merge
+        uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
+        for(uint stride = size / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(&l_key[pos +      0], &l_key[pos + stride], ddd);
+        }
+    }
+
+    //ddd == dir for the last bitonic merge step
+    {
+        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    pKey[                   0] = l_key[get_local_id(0) +                    0];
+    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
+{
+    __local int2 l_key[1024U];
+    uint localSizeLimit = get_local_size(0) * 2;
+
+    //Offset to the beginning of subarray and load data
+    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
+    l_key[get_local_id(0) +                    0] = pKey[                   0];
+    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
+
+    uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
+
+    for(uint size = 2; size < localSizeLimit; size <<= 1)
+    {
+        //Bitonic merge
+        uint ddd = (comparatorI & (size / 2)) != 0;
+        for(uint stride = size / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
+        }
+    }
+
+    //Odd / even arrays of localSizeLimit elements
+    //sorted in opposite directions
+    {
+        uint ddd = (get_group_id(0) & 1);
+        for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    pKey[                   0] = l_key[get_local_id(0) +                    0];
+    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
+{
+    uint global_comparatorI = get_global_id(0);
+    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
+
+    //Bitonic merge
+    uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
+    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
+
+    int2 keyA = pKey[pos +      0];
+    int2 keyB = pKey[pos + stride];
+
+    ComparatorPrivate(&keyA, &keyB, ddd);
+
+    pKey[pos +      0] = keyA;
+    pKey[pos + stride] = keyB;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
+{
+    __local int2 l_key[1024U];
+    int localSizeLimit = get_local_size(0) * 2;
+
+    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
+    l_key[get_local_id(0) +                    0] = pKey[                   0];
+    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
+
+    //Bitonic merge
+    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+    uint         ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
+    for(; stride > 0; stride >>= 1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+        ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    pKey[                   0] = l_key[get_local_id(0) +                    0];
+    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
+}
+);
--- a/test/OpenCL/BitonicSort/b3BitonicSort.cpp
+++ b/test/OpenCL/BitonicSort/b3BitonicSort.cpp
@ -0,0 +1,83 @@
+
+#include "b3BitonicSort.h"
+#include "Bullet3Common/b3Scalar.h"
+
+
+//Note: logically shared with BitonicSort OpenCL code!
+// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
+
+void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
+{
+	
+    if(arrayLength < 2)
+        return;
+    //Only power-of-two array lengths are supported so far
+    info.dir = (info.dir != 0);
+    cl_int ciErrNum;
+    size_t localWorkSize, globalWorkSize;
+    if(arrayLength <= info.localSizeLimit)
+    {
+        b3Assert( ( arrayLength) % info.localSizeLimit == 0);
+        //Launch bitonicSortLocal
+		ciErrNum  = clSetKernelArg(info.bitonicSortLocal, 0,   sizeof(cl_mem), (void *)&pKey);
+        ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1,  sizeof(cl_uint), (void *)&arrayLength);
+        ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2,  sizeof(cl_uint), (void *)&info.dir);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+        localWorkSize  = info.localSizeLimit / 2;
+        globalWorkSize =  arrayLength / 2;
+        ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+    }
+    else
+    {
+        //Launch bitonicSortLocal1
+        ciErrNum  = clSetKernelArg(info.bitonicSortLocal1, 0,  sizeof(cl_mem), (void *)&pKey);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+        localWorkSize  = info.localSizeLimit / 2;
+        globalWorkSize =  arrayLength / 2;
+        ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+        oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+        for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
+        {
+            for(unsigned stride = size / 2; stride > 0; stride >>= 1)
+            {
+                if(stride >= info.localSizeLimit)
+                {
+                    //Launch bitonicMergeGlobal
+                    ciErrNum  = clSetKernelArg(info.bitonicSortMergeGlobal, 0,  sizeof(cl_mem), (void *)&pKey);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
+					oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+                    localWorkSize  = info.localSizeLimit / 4;
+                    globalWorkSize =  arrayLength / 2;
+
+                    ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+					oclCHECKERROR(ciErrNum, CL_SUCCESS);
+                }
+                else
+                {
+                    //Launch bitonicMergeLocal
+					ciErrNum  = clSetKernelArg(info.bitonicSortMergeLocal, 0,  sizeof(cl_mem), (void *)&pKey);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
+                    ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
+					oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+                    localWorkSize  = info.localSizeLimit / 2;
+                    globalWorkSize =  arrayLength / 2;
+
+                    ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+					oclCHECKERROR(ciErrNum, CL_SUCCESS);
+                    break;
+                }
+            }
+        }
+    }
+}
--- a/test/OpenCL/BitonicSort/b3BitonicSort.h
+++ b/test/OpenCL/BitonicSort/b3BitonicSort.h
@ -0,0 +1,30 @@
+#ifndef B3_BITONIC_SORT_H
+#define B3_BITONIC_SORT_H
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+
+struct b3BitonicSortInfo
+{
+	cl_command_queue m_cqCommandQue;
+	cl_kernel bitonicSortLocal;
+	cl_kernel bitonicSortLocal1;
+	cl_kernel bitonicSortMergeGlobal;
+	cl_kernel bitonicSortMergeLocal;
+	unsigned int dir;
+	unsigned int localSizeLimit;
+
+	b3BitonicSortInfo()
+	{
+		bitonicSortLocal=0;
+		bitonicSortLocal1=0;
+		bitonicSortMergeGlobal=0;
+		bitonicSortMergeLocal=0;
+		dir = 1;
+		localSizeLimit = 1024U;
+	}
+};
+
+
+void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info);
+
+#endif //B3_BITONIC_SORT_H
--- a/test/OpenCL/BitonicSort/main.cpp
+++ b/test/OpenCL/BitonicSort/main.cpp
@ -0,0 +1,192 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3Common/b3Int2.h"
+#include "Bullet3Common/b3Quickprof.h"
+
+#include "b3BitonicSort.h"
+
+#include <stdio.h>
+
+int numSuccess=0;
+int numFailed=0;
+
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+
+#define MSTRINGIFY(A) #A
+static const char* kernelSource= 
+#include "BitonicSort.cl"
+
+
+
+
+static bool compareFunc(const b3Int2& p, const b3Int2& q)
+{
+	return (p.x < q.x) || ((p.x == q.x) && ((p.y < q.y)));
+}
+
+int main(int argc, char* argv[])
+{
+	int ciErrNum = 0;
+
+	b3Clock clock;
+	
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+	const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
+
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+	int numPlatforms = b3OpenCLUtils::getNumPlatforms();
+	printf("Num Platforms = %d\n", numPlatforms);
+
+	for (int i=0;i<numPlatforms;i++)
+	{
+		cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
+		b3OpenCLPlatformInfo platformInfo;
+		b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
+		printf("--------------------------------\n");
+		printf("Platform info for platform nr %d:\n",i);
+		printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+		printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+		printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+		
+		cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
+		
+		int numDevices = b3OpenCLUtils::getNumDevices(context);
+		printf("Num Devices = %d\n", numDevices);
+		for (int j=0;j<numDevices;j++)
+		{
+			cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
+			b3OpenCLDeviceInfo devInfo;
+			b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
+			printf("m_deviceName = %s\n",devInfo.m_deviceName);
+			//b3OpenCLUtils::printDeviceInfo(dev);
+
+			g_cqCommandQue = clCreateCommandQueue(context, dev, 0, &ciErrNum);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			b3BitonicSortInfo info;
+			
+			info.bitonicSortLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.bitonicSortLocal1 = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal1",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.bitonicSortMergeGlobal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeGlobal",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.bitonicSortMergeLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeLocal",&ciErrNum,0,"");
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			info.m_cqCommandQue = g_cqCommandQue;
+
+			b3OpenCLArray<b3Int2> keyValuesGPU(context,g_cqCommandQue);
+			b3AlignedObjectArray<b3Int2> keyValuesCPU;
+			b3AlignedObjectArray<b3Int2> keyValuesGold;
+			int numValues = 8*1024*1024;//2048;//1024;
+			keyValuesCPU.resize(numValues);
+			for (int i=0;i<numValues;i++)
+			{
+				b3Int2 v;
+				v.x = numValues+1-i;
+				v.y = i*i;
+				keyValuesCPU[i] = v;
+			}
+			keyValuesGPU.copyFromHost(keyValuesCPU);
+			keyValuesGPU.copyToHost(keyValuesGold);
+			keyValuesGold.quickSort(compareFunc);
+			
+			unsigned int batch = 1;
+			unsigned int arrayLength = keyValuesGPU.size();
+
+			for (int i=0;i<10;i++)
+			{
+				keyValuesGPU.copyFromHost(keyValuesCPU);
+				clFinish(info.m_cqCommandQue);
+				unsigned long pre=clock.getTimeMilliseconds();
+				bitonicSortNv(keyValuesGPU.getBufferCL(), arrayLength, info);
+				clFinish(info.m_cqCommandQue);
+				unsigned long post=clock.getTimeMilliseconds();
+				printf("GPU sort took %d ms\n",post-pre);
+			}
+			keyValuesGPU.copyToHost(keyValuesCPU);
+			int success=1;
+			for (int i=0;i<numValues;i++)
+			{
+				if (keyValuesCPU[i].x != keyValuesGold[i].x)
+					success = 0;
+				if (keyValuesCPU[i].y != keyValuesGold[i].y)
+					success = 0;
+			}
+			if (success)
+			{
+				printf("Correct\n");
+				numSuccess++;
+			} else
+			{
+				printf("Sort Failed\n");
+				numFailed++;
+			}
+			
+		}
+
+		clReleaseContext(context);
+	}
+
+	///Easier method to initialize OpenCL using createContextFromType for a GPU
+	deviceType = CL_DEVICE_TYPE_GPU;
+	
+	void* glCtx=0;
+	void* glDC = 0;
+	printf("Initialize OpenCL using b3OpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
+	g_cxMainContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	if (g_cxMainContext)
+	{
+		int numDev = b3OpenCLUtils::getNumDevices(g_cxMainContext);
+
+		for (int i=0;i<numDev;i++)
+		{
+			cl_device_id		device;
+			device = b3OpenCLUtils::getDevice(g_cxMainContext,i);
+			b3OpenCLDeviceInfo clInfo;
+			b3OpenCLUtils::getDeviceInfo(device,&clInfo);
+			b3OpenCLUtils::printDeviceInfo(device);
+			// create a command-queue
+			g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			//normally you would create and execute kernels using this command queue
+
+
+			clReleaseCommandQueue(g_cqCommandQue);
+		}
+
+		clReleaseContext(g_cxMainContext);
+
+	}
+	else {
+		printf("No OpenCL capable GPU found!");
+	}
+
+	printf("numSuccess=%d\n",numSuccess);
+	printf("numFailed=%d\n",numFailed);
+
+	printf("press <Enter>\n");
+	getchar();
+	return 0;
+}
--- a/test/OpenCL/BitonicSort/premake4.lua
+++ b/test/OpenCL/BitonicSort/premake4.lua
@ -0,0 +1,36 @@
+function createProject(vendor)
+	
+	hasCL = findOpenCL(vendor)
+	
+	if (hasCL) then
+
+		project ("Test_BitonicSort_" .. vendor)
+
+		initOpenCL(vendor)
+	
+		language "C++"
+				
+		
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		includedirs {"../../../src"}
+		
+		files {
+			"main.cpp",
+			"b3BitonicSort.cpp",
+			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+			"../../../src/Bullet3Common/b3AlignedAllocator.h",
+			"../../../src/Bullet3Common/b3Quickprof.cpp",
+			"../../../src/Bullet3Common/b3Quickprof.h",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+		}
+		
+	end
+end
+	
+createProject("Apple")
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
--- a/test/OpenCL/ParallelPrimitives/main.cpp
+++ b/test/OpenCL/ParallelPrimitives/main.cpp
@ -0,0 +1,378 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <stdio.h>
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
+#include "Bullet3Common/b3CommandLineArgs.h"
+#include "Bullet3Common/b3MinMax.h"
+
+int g_nPassed = 0;
+int g_nFailed = 0;
+bool g_testFailed = 0;
+
+#define TEST_INIT g_testFailed = 0;
+#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
+#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+cl_context g_context=0;
+cl_device_id g_device=0;
+cl_command_queue g_queue =0;
+const char* g_deviceName = 0;
+
+void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	void* glCtx=0;
+	void* glDC = 0;
+	int ciErrNum = 0;
+	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+
+	g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	int numDev = b3OpenCLUtils::getNumDevices(g_context);
+	if (numDev>0)
+	{
+		b3OpenCLDeviceInfo info;
+		g_device= b3OpenCLUtils::getDevice(g_context,0);
+		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+        b3OpenCLUtils::printDeviceInfo(g_device);
+		b3OpenCLUtils::getDeviceInfo(g_device,&info);
+		g_deviceName = info.m_deviceName;
+	}
+}
+
+void exitCL()
+{
+	clReleaseCommandQueue(g_queue);
+	clReleaseContext(g_context);
+}
+
+
+inline void fillIntTest()
+{
+	TEST_INIT;
+
+	b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
+	int maxSize=1024*256;
+	b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
+	intBuffer.resize(maxSize);
+	
+#define NUM_TESTS 7
+
+	int dx = maxSize/NUM_TESTS;
+	for (int iter=0;iter<NUM_TESTS;iter++)
+	{
+		int size = b3Min( 11+dx*iter, maxSize );
+
+		int value = 2;
+		
+
+		int offset=0;
+		fillCL->execute(intBuffer,value,size,offset);
+
+		b3AlignedObjectArray<int> hostBuf2;
+		hostBuf2.resize(size);
+		fillCL->executeHost(hostBuf2,value,size,offset);
+
+		b3AlignedObjectArray<int> hostBuf;
+		intBuffer.copyToHost(hostBuf);
+
+		for(int i=0; i<size; i++)
+		{
+				TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
+				TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
+		}
+	}
+
+	
+
+	delete fillCL;
+
+	TEST_REPORT( "fillIntTest" );
+}
+
+
+__inline
+void seedRandom(int seed)
+{
+	srand( seed );
+}
+
+template<typename T>
+__inline
+T getRandom(const T& minV, const T& maxV)
+{
+	float r = (rand()%10000)/10000.f;
+	T range = maxV - minV;
+	return (T)(minV + r*range);
+}
+
+struct b3SortDataCompare
+{
+	inline bool operator()(const b3SortData& first, const b3SortData& second) const
+	{
+		return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value);
+	}
+};
+
+
+void boundSearchTest( )
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+	int bucketSize = 256;
+
+	b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
+	b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
+	b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
+	
+	b3AlignedObjectArray<b3SortData> srcHost;
+	b3AlignedObjectArray<unsigned int> upperHost;
+	b3AlignedObjectArray<unsigned int> lowerHost;
+	b3AlignedObjectArray<unsigned int> upperHostCompare;
+	b3AlignedObjectArray<unsigned int> lowerHostCompare;
+	
+	b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
+
+
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		
+		int size = b3Min( 128+dx*iter, maxSize );
+
+		upperHost.resize(bucketSize);
+		lowerHost.resize(bucketSize);
+		upperHostCompare.resize(bucketSize);
+		lowerHostCompare.resize(bucketSize);
+
+		srcHost.resize(size);
+
+		for(int i=0; i<size; i++) 
+		{
+			b3SortData v;
+//			v.m_key = i<2? 0 : 5;
+			v.m_key = getRandom(0,bucketSize);
+
+			v.m_value = i;
+			srcHost.at(i) = v;
+		}
+
+		srcHost.quickSort(b3SortDataCompare());
+		srcCL.copyFromHost(srcHost);
+
+		{
+			
+			for(int i=0; i<bucketSize; i++) 
+			{
+				lowerHost[i] = -1;
+				lowerHostCompare[i] = -1;
+				upperHost[i] = -1;
+				upperHostCompare[i] = -1;
+			}
+			upperCL.copyFromHost(upperHost);
+			lowerCL.copyFromHost(lowerHost);
+		}
+
+		search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
+		search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
+
+		search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
+		search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
+
+		lowerCL.copyToHost(lowerHost);
+		upperCL.copyToHost(upperHost);
+		for(int i=0; i<bucketSize; i++)
+		{
+			TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
+			TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
+		}
+		/*
+		for(int i=1; i<bucketSize; i++)
+		{
+			int lhi_1 = lowerHost[i-1];
+			int lhi = lowerHost[i];
+
+			for(int j=lhi_1; j<lhi; j++)
+			//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
+			{
+				TEST_ASSERT( srcHost[j].m_key < i );
+			}
+		}
+
+		for(int i=0; i<bucketSize; i++)
+		{
+			int jMin = (i==0)?0:upperHost[i-1];
+			for(int j=jMin; j<upperHost[i]; j++)
+			{
+				TEST_ASSERT( srcHost[j].m_key <= i );
+			}
+		}
+		*/
+
+
+		for(int i=0; i<bucketSize; i++)
+		{
+			int lhi = lowerHost[i];
+			int uhi = upperHost[i];
+
+			for(int j=lhi; j<uhi; j++)
+			{
+				if ( srcHost[j].m_key != i )
+				{
+					printf("error %d != %d\n",srcHost[j].m_key,i);
+				}
+				TEST_ASSERT( srcHost[j].m_key == i );
+			}
+		}
+
+	}
+
+	delete search;
+
+	TEST_REPORT( "boundSearchTest" );
+}
+
+
+void prefixScanTest()
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+
+	b3AlignedObjectArray<unsigned int> buf0Host;
+	b3AlignedObjectArray<unsigned int> buf1Host;
+
+	b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
+	b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
+	
+	
+	b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
+		
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		int size = b3Min( 128+dx*iter, maxSize );
+		buf0Host.resize(size);
+		buf1Host.resize(size);
+
+		for(int i=0; i<size; i++) 
+			buf0Host[i] = 1;
+		
+		buf2CL.copyFromHost( buf0Host);
+	
+		unsigned int sumHost, sumGPU;
+
+		scan->executeHost(buf0Host, buf1Host, size, &sumHost );
+		scan->execute( buf2CL, buf3CL, size, &sumGPU );
+
+		buf3CL.copyToHost(buf0Host);
+		
+		TEST_ASSERT( sumHost == sumGPU );
+		for(int i=0; i<size; i++) 
+			TEST_ASSERT( buf1Host[i] == buf0Host[i] );
+	}
+
+	delete scan;
+
+	TEST_REPORT( "scanTest" );
+}
+
+
+bool radixSortTest()
+{
+	TEST_INIT;
+	
+	int maxSize = 1024*256;
+
+	b3AlignedObjectArray<b3SortData> buf0Host;
+	buf0Host.resize(maxSize);
+	b3AlignedObjectArray<b3SortData> buf1Host;
+	buf1Host.resize(maxSize );
+	b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
+
+	b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
+
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		int size = b3Min( 128+dx*iter, maxSize-512 );
+		size = NEXTMULTIPLEOF( size, 512 );//not necessary
+		
+		buf0Host.resize(size);
+
+		for(int i=0; i<size; i++)
+		{
+			b3SortData v;
+			v.m_key = getRandom(0,0xff);
+			v.m_value = i;
+			buf0Host[i] = v;
+		}
+
+		buf2CL.copyFromHost( buf0Host);
+		
+
+		sort->executeHost( buf0Host);
+		sort->execute(buf2CL);
+
+		buf2CL.copyToHost(buf1Host);
+				
+		for(int i=0; i<size; i++) 
+		{
+			TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
+		}
+	}
+
+	delete sort;
+
+	TEST_REPORT( "radixSort" );
+
+	return g_testFailed;
+}
+
+
+int main(int argc, char** argv)
+{
+	int preferredDeviceIndex = -1;
+	int preferredPlatformIndex = -1;
+
+	b3CommandLineArgs args(argc, argv);
+	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
+	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
+
+	initCL(preferredDeviceIndex,preferredPlatformIndex);
+
+	fillIntTest();
+
+	boundSearchTest();
+
+	prefixScanTest();
+
+	radixSortTest();
+
+	exitCL();
+
+	printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
+	printf("End, press <enter>\n");
+	getchar();
+}
--- a/test/OpenCL/ParallelPrimitives/premake4.lua
+++ b/test/OpenCL/ParallelPrimitives/premake4.lua
@ -0,0 +1,41 @@
+function createProject(vendor)	
+	hasCL = findOpenCL(vendor)
+	
+	if (hasCL) then
+
+		project ("Test_OpenCL_Primitives_" .. vendor)
+
+		initOpenCL(vendor)
+
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+		includedirs {".","../../../src"}
+		
+		
+		files {
+			"main.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h",
+			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+			"../../../src/Bullet3Common/b3AlignedAllocator.h",
+			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
+		}
+		
+	end
+end
+
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")
--- a/test/OpenCL/RadixSortBenchmark/main.cpp
+++ b/test/OpenCL/RadixSortBenchmark/main.cpp
@ -0,0 +1,712 @@
+/******************************************************************************
+ * Copyright 2010 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may ob3ain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * 
+ * 
+ * 
+ * AUTHORS' REQUEST: 
+ * 
+ * 		If you use|reference|benchmark this code, please cite our Technical 
+ * 		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
+ * 
+ *		@TechReport{ Merrill:Sorting:2010,
+ *        	author = "Duane Merrill and Andrew Grimshaw",
+ *        	title = "Revisiting Sorting for GPGPU Stream Architectures",
+ *        	year = "2010",
+ *        	institution = "University of Virginia, Department of Computer Science",
+ *        	address = "Charlottesville, VA, USA",
+ *        	number = "CS2010-03"
+ *		}
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ * Thanks!
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple test driver program for *large-problem* radix sorting.
+ *
+ * Useful for demonstrating how to integrate radix sorting into 
+ * your application 
+ ******************************************************************************/
+
+/******************************************************************************
+ * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
+ ******************************************************************************/
+#ifdef _WIN32
+#pragma warning (disable:4996)
+#endif
+#include <stdlib.h> 
+#include <stdio.h> 
+#include <string.h> 
+#include <math.h> 
+#include <float.h>
+#include <algorithm>
+#include <string>
+
+
+//#include <iostream>
+#include <sstream>
+/**********************
+*
+*/
+
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3Common/b3Quickprof.h"
+
+cl_context g_cxMainContext;
+cl_device_id g_device;
+cl_command_queue g_cqCommandQueue;
+
+/***********************
+*
+*/
+
+bool g_verbose;
+///Preferred OpenCL device/platform. When < 0 then no preference is used. 
+///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
+///Preferred device/platform take priority over this platform-vendor match
+int gPreferredDeviceId = -1;
+int gPreferredPlatformId = -1;
+
+
+
+/******************************************************************************
+ * Routines
+ ******************************************************************************/
+
+
+/**
+ * Keys-only sorting.  Uses the GPU to sort the specified vector of elements for the given 
+ * number of iterations, displaying runtime information.
+ *
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in] 		h_keys 
+ * 		Vector of keys to sort 
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+  * @param[in] 		cfg 
+ * 		Config
+ */
+template <typename K>
+void TimedSort(
+	unsigned int num_elements, 
+	K *h_keys,
+	unsigned int iterations)
+{
+	printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
+
+	int max_elements = num_elements;
+	b3AlignedObjectArray<unsigned int> hostData;
+	hostData.resize(num_elements);
+	for (int i=0;i<num_elements;i++)
+	{
+		hostData[i] = h_keys[i];
+	}
+
+	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
+
+	b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
+	gpuData.copyFromHost(hostData);
+	//sorter.executeHost(gpuData);
+    sorter.execute(gpuData);
+    
+	b3AlignedObjectArray<unsigned int> hostDataSorted;
+	gpuData.copyToHost(hostDataSorted);
+    
+	clFinish(g_cqCommandQueue);
+
+	{
+		//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
+
+		// Create sorting enactor
+
+		// Perform the timed number of sorting iterations
+		double elapsed = 0;
+		float duration = 0;
+		b3Clock watch;
+
+		//warm-start
+		gpuData.copyFromHost(hostData);
+		clFinish(g_cqCommandQueue);
+		sorter.execute(gpuData);
+
+		watch.reset();
+
+			
+		for (int i = 0; i < iterations; i++) 
+		{
+
+
+
+			// Move a fresh copy of the problem into device storage
+			gpuData.copyFromHost(hostData);
+			clFinish(g_cqCommandQueue);
+
+			// Start GPU timing record
+			double startMs = watch.getTimeMicroseconds()/1e3;
+			
+			// Call the sorting API routine
+			sorter.execute(gpuData);
+
+
+
+			clFinish(g_cqCommandQueue);
+	
+			double stopMs = watch.getTimeMicroseconds()/1e3;
+
+			duration = stopMs - startMs;
+			
+			// End GPU timing record
+			elapsed += (double) duration;
+			printf("duration = %f\n", duration);
+		}
+
+		// Display timing information
+		double avg_runtime = elapsed / iterations;
+	//	double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; 
+	//   printf(", %f GPU ms, %f x10^9 elts/sec\n", 	avg_runtime,	throughput);
+		double throughput = ((double) num_elements) / avg_runtime / 1000.0 ; 
+		printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
+
+		gpuData.copyToHost(hostData);
+		for (int i=0;i<num_elements;i++)
+		{
+			h_keys[i] = hostData[i];
+		}
+	}
+}
+
+/**
+ * Key-value sorting.  Uses the GPU to sort the specified vector of elements for the given 
+ * number of iterations, displaying runtime information.
+ *
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in] 		h_keys 
+ * 		Vector of keys to sort 
+ * @param[in,out] 	h_values  
+ * 		Vector of values to sort 
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+  * @param[in] 		cfg 
+ * 		Config
+ */
+template <typename K, typename V>
+void TimedSort(
+	unsigned int num_elements, 
+	K *h_keys,
+	V *h_values, 
+	unsigned int iterations) 
+{
+	
+	printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
+
+	int max_elements = num_elements;
+	b3AlignedObjectArray<b3SortData> hostData;
+	hostData.resize(num_elements);
+	for (int i=0;i<num_elements;i++)
+	{
+		hostData[i].m_key = h_keys[i];
+		hostData[i].m_value = h_values[i];
+	}
+
+	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
+
+	b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
+	gpuData.copyFromHost(hostData);
+	//sorter.executeHost(gpuData);
+    sorter.execute(gpuData);
+    
+	b3AlignedObjectArray<b3SortData> hostDataSorted;
+	gpuData.copyToHost(hostDataSorted);
+#if 0
+    for (int i=0;i<num_elements;i++)
+	{
+		printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
+        printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
+	}
+#endif
+    
+clFinish(g_cqCommandQueue);
+
+	{
+		//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
+
+		// Create sorting enactor
+
+		// Perform the timed number of sorting iterations
+		double elapsed = 0;
+		float duration = 0;
+		b3Clock watch;
+		
+		//warm-start
+		gpuData.copyFromHost(hostData);
+		sorter.execute(gpuData);
+		clFinish(g_cqCommandQueue);
+
+		watch.reset();
+
+			
+		for (int i = 0; i < iterations; i++) 
+		{
+
+
+
+			// Move a fresh copy of the problem into device storage
+			gpuData.copyFromHost(hostData);
+			clFinish(g_cqCommandQueue);
+
+			// Start GPU timing record
+			double startMs = watch.getTimeMicroseconds()/1e3;
+			
+			// Call the sorting API routine
+			sorter.execute(gpuData);
+			clFinish(g_cqCommandQueue);
+	
+			double stopMs = watch.getTimeMicroseconds()/1e3;
+
+			duration = stopMs - startMs;
+			
+			// End GPU timing record
+			elapsed += (double) duration;
+			printf("duration = %f\n", duration);
+		}
+
+		// Display timing information
+		double avg_runtime = elapsed / iterations;
+	//	double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; 
+	//   printf(", %f GPU ms, %f x10^9 elts/sec\n", 	avg_runtime,	throughput);
+		double throughput = ((double) num_elements) / avg_runtime / 1000.0 ; 
+		printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
+
+		gpuData.copyToHost(hostData);
+		for (int i=0;i<num_elements;i++)
+		{
+			h_keys[i] = hostData[i].m_key;
+			h_values[i] = hostData[i].m_value;
+		}
+	}
+}
+
+
+
+/**
+ * Generates random 32-bit keys.
+ * 
+ * We always take the second-order byte from rand() because the higher-order 
+ * bits returned by rand() are commonly considered more uniformly distributed
+ * than the lower-order bits.
+ * 
+ * We can decrease the entropy level of keys by adopting the technique 
+ * of Thearling and Smith in which keys are computed from the bitwise AND of 
+ * multiple random samples: 
+ * 
+ * entropy_reduction	| Effectively-unique bits per key
+ * -----------------------------------------------------
+ * -1					| 0
+ * 0					| 32
+ * 1					| 25.95
+ * 2					| 17.41
+ * 3					| 10.78
+ * 4					| 6.42
+ * ...					| ...
+ * 
+ */
+template <typename K>
+void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
+{
+	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
+	unsigned char key_bits[NUM_UCHARS];
+	
+	do {
+	
+		for (int j = 0; j < NUM_UCHARS; j++) {
+			unsigned char quarterword = 0xff;
+			for (int i = 0; i <= entropy_reduction; i++) {
+				quarterword &= (rand() >> 7);
+			}
+			key_bits[j] = quarterword;
+		}
+		
+		if (lower_key_bits < sizeof(K) * 8) {
+			unsigned long long base = 0;
+			memcpy(&base, key_bits, sizeof(K));
+			base &= (1 << lower_key_bits) - 1;
+			memcpy(key_bits, &base, sizeof(K));
+		}
+		
+		memcpy(&key, key_bits, sizeof(K));
+		
+	} while (key != key);		// avoids NaNs when generating random floating point numbers 
+}
+
+
+/******************************************************************************
+ * Templated routines for printing keys/values to the console 
+ ******************************************************************************/
+
+template<typename T> 
+void PrintValue(T val) {
+	printf("%d", val);
+}
+
+template<>
+void PrintValue<float>(float val) {
+	printf("%f", val);
+}
+
+template<>
+void PrintValue<double>(double val) {
+	printf("%f", val);
+}
+
+template<>
+void PrintValue<unsigned char>(unsigned char val) {
+	printf("%u", val);
+}
+
+template<>
+void PrintValue<unsigned short>(unsigned short val) {
+	printf("%u", val);
+}
+
+template<>
+void PrintValue<unsigned int>(unsigned int val) {
+	printf("%u", val);
+}
+
+template<>
+void PrintValue<long>(long val) {
+	printf("%ld", val);
+}
+
+template<>
+void PrintValue<unsigned long>(unsigned long val) {
+	printf("%lu", val);
+}
+
+template<>
+void PrintValue<long long>(long long val) {
+	printf("%lld", val);
+}
+
+template<>
+void PrintValue<unsigned long long>(unsigned long long val) {
+	printf("%llu", val);
+}
+
+
+
+/**
+ * Compares the equivalence of two arrays
+ */
+template <typename T, typename SizeT>
+int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
+{
+	printf("\n");
+	for (SizeT i = 0; i < len; i++) {
+
+		if (computed[i] != reference[i]) {
+			printf("INCORRECT: [%lu]: ", (unsigned long) i);
+			PrintValue<T>(computed[i]);
+			printf(" != ");
+			PrintValue<T>(reference[i]);
+
+			if (verbose) {
+				printf("\nresult[...");
+				for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
+					PrintValue<T>(computed[j]);
+					printf(", ");
+				}
+				printf("...]");
+				printf("\nreference[...");
+				for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
+					PrintValue<T>(reference[j]);
+					printf(", ");
+				}
+				printf("...]");
+			}
+
+			return 1;
+		}
+	}
+
+	printf("CORRECT\n");
+	return 0;
+}
+
+/**
+ * Creates an example sorting problem whose keys is a vector of the specified 
+ * number of K elements, values of V elements, and then dispatches the problem 
+ * to the GPU for the given number of iterations, displaying runtime information.
+ *
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in] 		cfg 
+ * 		Config
+ */
+template<typename K, typename V>
+void TestSort(
+	unsigned int iterations,
+	int num_elements,
+	bool keys_only)
+{
+    // Allocate the sorting problem on the host and fill the keys with random bytes
+
+	K *h_keys = NULL;
+	K *h_reference_keys = NULL;
+	V *h_values = NULL;
+	h_keys = (K*) malloc(num_elements * sizeof(K));
+	h_reference_keys = (K*) malloc(num_elements * sizeof(K));
+	if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
+	
+
+	// Use random bits
+	for (unsigned int i = 0; i < num_elements; ++i) {
+		RandomBits<K>(h_keys[i], 0);
+		//h_keys[i] = num_elements-i;
+        //h_keys[i] = 0xffffffffu-i;
+		if (!keys_only)
+			h_values[i] = h_keys[i];//0xffffffffu-i;
+
+		h_reference_keys[i] = h_keys[i];
+	}
+
+    // Run the timing test 
+	if (keys_only) {
+		TimedSort<K>(num_elements, h_keys, iterations);
+	} else {
+		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
+	}
+
+//	cudaThreadSynchronize();
+    
+	// Display sorted key data
+	if (g_verbose) {
+		printf("\n\nKeys:\n");
+		for (int i = 0; i < num_elements; i++) {	
+			PrintValue<K>(h_keys[i]);
+			printf(", ");
+		}
+		printf("\n\n");
+	}	
+	
+    // Verify solution
+	std::sort(h_reference_keys, h_reference_keys + num_elements);	
+	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
+	printf("\n");
+	fflush(stdout);
+
+	// Free our allocated host memory 
+	if (h_keys != NULL) free(h_keys);
+    if (h_values != NULL) free(h_values);
+}
+
+
+
+/**
+ * Displays the commandline usage for this tool
+ */
+void Usage() 
+{
+	printf("\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n"); 
+	printf("\n");
+	printf("\t--v\tDisplays sorted results to the console.\n");
+	printf("\n");
+	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
+	printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n");
+	printf("\n");
+	printf("\t--n\tThe number of elements to comprise the sample problem\n");
+	printf("\t\t\tDefault = 512\n");
+	printf("\n");
+	printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
+	printf("\n");
+}
+
+
+/******************************************************************************
+ * Command-line parsing
+ ******************************************************************************/
+#include <map>
+#include <algorithm>
+#include <string>
+
+class b3CommandLineArgs
+{
+protected:
+
+	std::map<std::string, std::string> pairs;
+
+public:
+
+	// Constructor
+	b3CommandLineArgs(int argc, char **argv)
+	{
+		using namespace std;
+
+	    for (int i = 1; i < argc; i++)
+	    {
+	        string arg = argv[i];
+
+	        if ((arg[0] != '-') || (arg[1] != '-')) {
+	        	continue;
+	        }
+
+        	string::size_type pos;
+		    string key, val;
+	        if ((pos = arg.find( '=')) == string::npos) {
+	        	key = string(arg, 2, arg.length() - 2);
+	        	val = "";
+	        } else {
+	        	key = string(arg, 2, pos - 2);
+	        	val = string(arg, pos + 1, arg.length() - 1);
+	        }
+        	pairs[key] = val;
+	    }
+	}
+
+	bool CheckCmdLineFlag(const char* arg_name)
+	{
+		using namespace std;
+		map<string, string>::iterator itr;
+		if ((itr = pairs.find(arg_name)) != pairs.end()) {
+			return true;
+	    }
+		return false;
+	}
+
+	template <typename T>
+	void GetCmdLineArgument(const char *arg_name, T &val);
+
+	int ParsedArgc()
+	{
+		return pairs.size();
+	}
+};
+
+template <typename T>
+void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
+{
+	using namespace std;
+	map<string, string>::iterator itr;
+	if ((itr = pairs.find(arg_name)) != pairs.end()) {
+		istringstream strstream(itr->second);
+		strstream >> val;
+    }
+}
+
+template <>
+void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
+{
+	using namespace std;
+	map<string, string>::iterator itr;
+	if ((itr = pairs.find(arg_name)) != pairs.end()) {
+
+		string s = itr->second;
+		val = (char*) malloc(sizeof(char) * (s.length() + 1));
+		strcpy(val, s.c_str());
+
+	} else {
+    	val = NULL;
+	}
+}
+
+
+
+
+
+/******************************************************************************
+ * Main
+ ******************************************************************************/
+
+extern bool gDebugSkipLoadingBinary;
+
+int main( int argc, char** argv) 
+{
+	//gDebugSkipLoadingBinary = true;
+
+	cl_int ciErrNum;
+	b3CommandLineArgs args(argc,argv);
+
+	args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
+	args.GetCmdLineArgument("platformId", gPreferredPlatformId);
+
+	printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
+	cl_platform_id platformId;
+//	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+	//g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+	
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
+
+	if (!numDev)
+	{
+		printf("error: no OpenCL devices\n");
+		exit(0);
+	}
+	int result;
+	int devId = 0;
+	g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
+	b3OpenCLUtils_printDeviceInfo(g_device);
+	// create a command-queue
+	g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+
+
+	//srand(time(NULL));	
+	srand(0);				// presently deterministic
+
+    unsigned int num_elements 					= 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
+    unsigned int iterations  					= 10;
+    bool keys_only = true;
+
+    //
+	// Check command line arguments
+    //
+
+	
+
+	if (args.CheckCmdLineFlag("help"))
+	{
+		Usage();
+		return 0;
+	}
+	
+	args.GetCmdLineArgument("i", iterations);
+	args.GetCmdLineArgument("n", num_elements);
+	
+
+
+	keys_only = !args.CheckCmdLineFlag("key-values");
+	g_verbose = args.CheckCmdLineFlag("v");
+
+
+
+	TestSort<unsigned int, unsigned int>(
+			iterations,
+			num_elements, 
+			keys_only);
+
+
+}
--- a/test/OpenCL/RadixSortBenchmark/premake4.lua
+++ b/test/OpenCL/RadixSortBenchmark/premake4.lua
@ -0,0 +1,40 @@
+function createProject(vendor)
+	hasCL = findOpenCL(vendor)
+	
+	if (hasCL) then
+
+		project ("Test_OpenCL_RadixSortBenchmark_" .. vendor)
+
+		initOpenCL(vendor)
+		
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+		includedirs {"..","../../../src"}
+		
+--		links {
+--			("OpenCL_lib_parallel_primitives_host_" .. vendor)
+--		}
+		
+		files {
+			"main.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp",
+			"../../../src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp",
+			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+			"../../../src/Bullet3Common/b3AlignedAllocator.h",
+			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
+			"../../../src/Bullet3Common/b3Quickprof.cpp",
+			"../../../src/Bullet3Common/b3Quickprof.h",
+		}
+		
+	end
+end
+
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")