reorder files, in preparation for Bullet 3 -> Bullet 2 merge

2024-12-14 13:50:04 +00:00 · 2013-04-29 19:04:08 -07:00 · 2013-04-29 19:04:08 -07:00 · 3ac332f3a7
commit 3ac332f3a7
parent 55b69201a9
162 changed files with 215 additions and 3070 deletions
--- a/Demos3/GpuDemos/GpuDemo.cpp
+++ b/Demos3/GpuDemos/GpuDemo.cpp
@ -1,7 +1,7 @@
 #include "GpuDemo.h"
 #include "GpuDemoInternalData.h"
 #include "Bullet3Common/b3Scalar.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/ShapeData.h"
 #include "OpenGLWindow/GLInstancingRenderer.h"

--- a/Demos3/GpuDemos/GpuDemo.h
+++ b/Demos3/GpuDemos/GpuDemo.h
--- a/Demos3/GpuDemos/GpuDemoInternalData.h
+++ b/Demos3/GpuDemos/GpuDemoInternalData.h
@ -1,7 +1,7 @@
 #ifndef GPU_DEMO_INTERNAL_DATA_H
 #define GPU_DEMO_INTERNAL_DATA_H

-#include "basic_initialize/b3OpenCLInclude.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"

 struct GpuDemoInternalData
 {
--- a/Demos3/GpuDemos/ParticleDemo.cpp
+++ b/Demos3/GpuDemos/ParticleDemo.cpp
@ -2,7 +2,7 @@

 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "OpenGLWindow/ShapeData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"

 #define MSTRINGIFY(A) #A
 static char* particleKernelsString = 
@ -12,10 +12,10 @@ static char* particleKernelsString =
 #include "Bullet3Common/b3Vector3.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
 //#include "../../opencl/primitives/AdlPrimitives/Math/Math.h"
 //#include "../../opencl/broadphase_benchmark/b3GridBroadphaseCL.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "GpuDemoInternalData.h"


--- a/Demos3/GpuDemos/ParticleDemo.h
+++ b/Demos3/GpuDemos/ParticleDemo.h
--- a/Demos3/GpuDemos/ParticleKernels.cl
+++ b/Demos3/GpuDemos/ParticleKernels.cl
--- a/Demos3/GpuDemos/broadphase/PairBench.cpp
+++ b/Demos3/GpuDemos/broadphase/PairBench.cpp
@ -4,12 +4,12 @@
 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "OpenGLWindow/b3gWindowInterface.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "../GpuDemoInternalData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"

 static b3KeyboardCallback oldCallback = 0;
 extern bool gReset;
--- a/Demos3/GpuDemos/broadphase/PairBench.h
+++ b/Demos3/GpuDemos/broadphase/PairBench.h
--- a/Demos3/GpuDemos/gwenUserInterface.cpp
+++ b/Demos3/GpuDemos/gwenUserInterface.cpp
--- a/Demos3/GpuDemos/gwenUserInterface.h
+++ b/Demos3/GpuDemos/gwenUserInterface.h
--- a/Demos3/GpuDemos/main_opengl3core.cpp
+++ b/Demos3/GpuDemos/main_opengl3core.cpp
--- a/Demos3/GpuDemos/premake4.lua
+++ b/Demos3/GpuDemos/premake4.lua
@ -4,7 +4,7 @@ function createProject(vendor)
 	
 	if (hasCL) then

-		project ("Bullet3_OpenCL_gpu_demo_" .. vendor)
+		project ("App_Bullet3_OpenCL_Demos_" .. vendor)

 		initOpenCL(vendor)
 		
@ -20,8 +20,7 @@ function createProject(vendor)
 		includedirs {
 		 	"..",
 		 	"../../src",
-		 	"../../btgui",
-		 	"../../opencl"
+		 	"../../btgui"
 		}
 		
 		links {
@ -30,21 +29,23 @@ function createProject(vendor)
 			"Bullet3Geometry",
 			"Bullet3Collision",
 			"Bullet3Dynamics",
-			"Bullet2FileLoader"
+			"Bullet2FileLoader",
+			"Bullet3OpenCL_" .. vendor
+			
 		}
 		
 		files {
 			"**.cpp",
 			"**.h",
 			
-			"../ObjLoader/string_extra.cpp",
-			"../ObjLoader/string_extra.h",
-			"../ObjLoader/objLoader.cpp",
-			"../ObjLoader/objLoader.h",
-			"../ObjLoader/obj_parser.cpp",
-			"../ObjLoader/obj_parser.h",
-			"../ObjLoader/list.cpp",
-			"../ObjLoader/list.h",
+			"../Wavefront/string_extra.cpp",
+			"../Wavefront/string_extra.h",
+			"../Wavefront/objLoader.cpp",
+			"../Wavefront/objLoader.h",
+			"../Wavefront/obj_parser.cpp",
+			"../Wavefront/obj_parser.h",
+			"../Wavefront/list.cpp",
+			"../Wavefront/list.h",
 			
 			
 			"../../btgui/OpenGLWindow/GLInstancingRenderer.cpp",
@ -60,21 +61,6 @@ function createProject(vendor)
 			"../../btgui/OpenGLTrueTypeFont/opengl_fontstashcallbacks.cpp",
 			"../../btgui/OpenGLTrueTypeFont/opengl_fontstashcallbacks.h",
 			"../../btgui/FontFiles/OpenSans.cpp",
-			"../../opencl/basic_initialize/b3OpenCLUtils.cpp",
-			"../../opencl/basic_initialize/b3OpenCLUtils.h",
-			"../../opencl/gpu_broadphase/host/b3GpuSapBroadphase.cpp",
-			"../../opencl/gpu_narrowphase/host/**.cpp",
-			"../../opencl/gpu_narrowphase/host/**.h",
-			"../../opencl/parallel_primitives/host/b3BoundSearchCL.cpp",
-			"../../opencl/parallel_primitives/host/b3BoundSearchCL.h",
-			"../../opencl/parallel_primitives/host/b3FillCL.cpp",
-			"../../opencl/parallel_primitives/host/b3FillCL.h",
-			"../../opencl/parallel_primitives/host/b3PrefixScanCL.cpp",
-			"../../opencl/parallel_primitives/host/b3PrefixScanCL.h",
-			"../../opencl/parallel_primitives/host/b3RadixSort32CL.cpp",
-			"../../opencl/parallel_primitives/host/b3RadixSort32CL.h",
-			"../../opencl/gpu_rigidbody/host/**.cpp",
-			"../../opencl/gpu_rigidbody/host/**.h",

 		}

--- a/Demos3/GpuDemos/rigidbody/Bullet2FileDemo.cpp
+++ b/Demos3/GpuDemos/rigidbody/Bullet2FileDemo.cpp
--- a/Demos3/GpuDemos/rigidbody/Bullet2FileDemo.h
+++ b/Demos3/GpuDemos/rigidbody/Bullet2FileDemo.h
--- a/Demos3/GpuDemos/rigidbody/BulletDataExtractor.cpp
+++ b/Demos3/GpuDemos/rigidbody/BulletDataExtractor.cpp
@ -22,11 +22,11 @@ extern bool enableExperimentalCpuConcaveCollision;
 //#include "LinearMath/b3Quickprof.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "Bullet3Common/b3Matrix3x3.h"
-#include "gpu_narrowphase/host/b3ConvexUtility.h"
+#include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.h"
 #include "OpenGLWindow/ShapeData.h"
-#include "../../ObjLoader/objLoader.h"
-#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
-#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
+#include "../../Wavefront/objLoader.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"

 ///work-in-progress 
 ///This ReadBulletSample is kept as simple as possible without dependencies to the Bullet SDK.
--- a/Demos3/GpuDemos/rigidbody/BulletDataExtractor.h
+++ b/Demos3/GpuDemos/rigidbody/BulletDataExtractor.h
--- a/Demos3/GpuDemos/rigidbody/ConcaveScene.cpp
+++ b/Demos3/GpuDemos/rigidbody/ConcaveScene.cpp
@ -6,17 +6,17 @@
 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "OpenGLWindow/b3gWindowInterface.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "../GpuDemoInternalData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
-#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
-#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
-#include "gpu_rigidbody/host/b3Config.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
+#include "Bullet3OpenCL/RigidBody/b3Config.h"
 #include "GpuRigidBodyDemoInternalData.h"
-#include"../../ObjLoader/objLoader.h"
+#include"../../Wavefront/objLoader.h"
 #include "Bullet3Common/b3Transform.h"

 #include "OpenGLWindow/GLInstanceGraphicsShape.h"
--- a/Demos3/GpuDemos/rigidbody/ConcaveScene.h
+++ b/Demos3/GpuDemos/rigidbody/ConcaveScene.h
--- a/Demos3/GpuDemos/rigidbody/GpuCompoundScene.cpp
+++ b/Demos3/GpuDemos/rigidbody/GpuCompoundScene.cpp
@ -6,15 +6,15 @@
 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "OpenGLWindow/b3gWindowInterface.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "../GpuDemoInternalData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
-#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
-#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
-#include "gpu_rigidbody/host/b3Config.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
+#include "Bullet3OpenCL/RigidBody/b3Config.h"
 #include "GpuRigidBodyDemoInternalData.h"
 #include "Bullet3Common/b3Transform.h"

--- a/Demos3/GpuDemos/rigidbody/GpuCompoundScene.h
+++ b/Demos3/GpuDemos/rigidbody/GpuCompoundScene.h
--- a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp
+++ b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp
@ -6,15 +6,15 @@
 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "OpenGLWindow/b3gWindowInterface.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "../GpuDemoInternalData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
-#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
-#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
-#include "gpu_rigidbody/host/b3Config.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
+#include "Bullet3OpenCL/RigidBody/b3Config.h"
 #include "GpuRigidBodyDemoInternalData.h"
 #include "../gwenUserInterface.h"
 #include "Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h"
--- a/Demos3/GpuDemos/rigidbody/GpuConvexScene.h
+++ b/Demos3/GpuDemos/rigidbody/GpuConvexScene.h
--- a/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp
+++ b/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp
@ -4,15 +4,15 @@
 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "OpenGLWindow/b3gWindowInterface.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "../GpuDemoInternalData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
-#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
-#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
-#include "gpu_rigidbody/host/b3Config.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
+#include "Bullet3OpenCL/RigidBody/b3Config.h"
 #include "GpuRigidBodyDemoInternalData.h"
 #include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"

--- a/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.h
+++ b/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.h
--- a/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemoInternalData.h
+++ b/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemoInternalData.h
@ -1,8 +1,8 @@
 #ifndef GPU_RIGIDBODY_INTERNAL_DATA_H
 #define GPU_RIGIDBODY_INTERNAL_DATA_H

-#include "basic_initialize/b3OpenCLUtils.h"
-#include "parallel_primitives/host/b3OpenCLArray.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
 #include "Bullet3Common/b3Vector3.h"

 struct	GpuRigidBodyDemoInternalData
--- a/Demos3/GpuDemos/rigidbody/GpuSphereScene.cpp
+++ b/Demos3/GpuDemos/rigidbody/GpuSphereScene.cpp
@ -2,19 +2,18 @@
 #include "GpuRigidBodyDemo.h"
 #include "Bullet3Common/b3Quickprof.h"
 #include "OpenGLWindow/ShapeData.h"
-
 #include "OpenGLWindow/GLInstancingRenderer.h"
 #include "Bullet3Common/b3Quaternion.h"
 #include "OpenGLWindow/b3gWindowInterface.h"
-#include "gpu_broadphase/host/b3GpuSapBroadphase.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
 #include "../GpuDemoInternalData.h"
-#include "basic_initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "OpenGLWindow/OpenGLInclude.h"
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
-#include "gpu_rigidbody/host/b3GpuRigidBodyPipeline.h"
-#include "gpu_rigidbody/host/b3GpuNarrowPhase.h"
-#include "gpu_rigidbody/host/b3Config.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h"
+#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h"
+#include "Bullet3OpenCL/RigidBody/b3Config.h"
 #include "GpuRigidBodyDemoInternalData.h"
 #include "../gwenUserInterface.h"

--- a/Demos3/GpuDemos/rigidbody/GpuSphereScene.h
+++ b/Demos3/GpuDemos/rigidbody/GpuSphereScene.h
--- a/Demos3/GpuGuiInitialize/main.cpp
+++ b/Demos3/GpuGuiInitialize/main.cpp
--- a/Demos3/GpuGuiInitialize/premake4.lua
+++ b/Demos3/GpuGuiInitialize/premake4.lua
--- a/Demos3/Wavefront/list.cpp
+++ b/Demos3/Wavefront/list.cpp
--- a/Demos3/Wavefront/list.h
+++ b/Demos3/Wavefront/list.h
--- a/Demos3/Wavefront/objLoader.cpp
+++ b/Demos3/Wavefront/objLoader.cpp
--- a/Demos3/Wavefront/objLoader.h
+++ b/Demos3/Wavefront/objLoader.h
--- a/Demos3/Wavefront/objTester.cpp
+++ b/Demos3/Wavefront/objTester.cpp
--- a/Demos3/Wavefront/obj_parser.cpp
+++ b/Demos3/Wavefront/obj_parser.cpp
--- a/Demos3/Wavefront/obj_parser.h
+++ b/Demos3/Wavefront/obj_parser.h
--- a/Demos3/Wavefront/premake4.lua
+++ b/Demos3/Wavefront/premake4.lua
--- a/Demos3/Wavefront/string_extra.cpp
+++ b/Demos3/Wavefront/string_extra.cpp
--- a/Demos3/Wavefront/string_extra.h
+++ b/Demos3/Wavefront/string_extra.h
--- a/Demos3/donttouch/Bullet2GpuDemo.cpp
+++ b/Demos3/donttouch/Bullet2GpuDemo.cpp
--- a/Demos3/donttouch/Bullet2GpuDemo.h
+++ b/Demos3/donttouch/Bullet2GpuDemo.h
--- a/Demos3/donttouch/GpuDemo.cpp
+++ b/Demos3/donttouch/GpuDemo.cpp
--- a/Demos3/donttouch/GpuDemo.h
+++ b/Demos3/donttouch/GpuDemo.h
--- a/Demos3/donttouch/OpenGL3CoreRenderer.cpp
+++ b/Demos3/donttouch/OpenGL3CoreRenderer.cpp
--- a/Demos3/donttouch/OpenGL3CoreRenderer.h
+++ b/Demos3/donttouch/OpenGL3CoreRenderer.h
--- a/Demos3/donttouch/b3CpuDynamicsWorld.cpp
+++ b/Demos3/donttouch/b3CpuDynamicsWorld.cpp
--- a/Demos3/donttouch/b3CpuDynamicsWorld.h
+++ b/Demos3/donttouch/b3CpuDynamicsWorld.h
--- a/Demos3/donttouch/b3GpuDynamicsWorld.cpp
+++ b/Demos3/donttouch/b3GpuDynamicsWorld.cpp
--- a/Demos3/donttouch/b3GpuDynamicsWorld.h
+++ b/Demos3/donttouch/b3GpuDynamicsWorld.h
--- a/btgui/GwenOpenGLTest/premake4.lua
+++ b/btgui/GwenOpenGLTest/premake4.lua
@ -1,5 +1,5 @@

-	project "Gwen_OpenGLTest"
+	project "Test_Gwen_OpenGL"
 		
 	kind "ConsoleApp"
 	flags {"Unicode"}
--- a/build/premake4.lua
+++ b/build/premake4.lua
@ -91,30 +91,35 @@


 	if not _OPTIONS["ios"] then
-		include "../demo/gpudemo"
-	include "../btgui/MidiTest"
+--		include "../demo/gpudemo"
+--	include "../btgui/MidiTest"
 --		include "../opencl/vector_add_simplified"
 --		include "../opencl/vector_add"
-		include "../opencl/basic_initialize"
-		include "../demo/gpu_initialize"
-		include "../opencl/parallel_primitives/host"
-		include "../opencl/parallel_primitives/test"
-		include "../opencl/parallel_primitives/benchmark"
-		include "../opencl/lds_bank_conflict"
--		include "../opencl/reduce"
--		include "../opencl/gpu_broadphase/test"
--		include "../opencl/gpu_narrowphase/test"
 		include "../btgui/Gwen"
 		include "../btgui/GwenOpenGLTest"
-		include "../btgui/OpenGLTrueTypeFont"
--		include "../btgui/OpenGLWindow"
--		include "../demo/ObjLoader"
+
+		include "../test/OpenCL/BasicInitialize"
+		include "../test/OpenCL/BroadphaseCollision"
+		include "../test/OpenCL/NarrowphaseCollision"
+		include "../test/OpenCL/ParallelPrimitives"
+
 		include "../src/Bullet3Dynamics"
 		include "../src/Bullet3Common"
 		include "../src/Bullet3Geometry"
 		include "../src/Bullet3Collision"
 		include "../src/Bullet3Serialize/Bullet2FileLoader"
+	
+		include "../src/Bullet3OpenCL"
+		include "../Demos3/GpuDemos"
+			
+--		include "../demo/gpu_initialize"
+--		include "../opencl/lds_bank_conflict"
+--		include "../opencl/reduce"
+--		include "../btgui/OpenGLTrueTypeFont"
+--		include "../btgui/OpenGLWindow"
+--		include "../demo/ObjLoader"
+
 		
-		include "../test/b3DynamicBvhBroadphase"
+--		include "../test/b3DynamicBvhBroadphase"
 		
 	end
--- a/opencl/basic_initialize/premake4.lua
+++ b/opencl/basic_initialize/premake4.lua
@ -1,28 +0,0 @@
-function createProject(vendor)
-	
-	hasCL = findOpenCL(vendor)
-	
-	if (hasCL) then
-
-		project ("OpenCL_intialize_" .. vendor)
-
-		initOpenCL(vendor)
-	
-		language "C++"
-				
-		kind "ConsoleApp"
-		targetdir "../../bin"
-
-		files {
-			"main.cpp",
-			"b3OpenCLUtils.cpp",
-			"b3OpenCLUtils.h"
-		}
-		
-	end
-end
-	
-createProject("Apple")
-createProject("AMD")
-createProject("Intel")
-createProject("NVIDIA")
--- a/opencl/gpu_broadphase/test/main.cpp
+++ b/opencl/gpu_broadphase/test/main.cpp
@ -1,129 +0,0 @@
-/*
-Copyright (c) 2012 Advanced Micro Devices, Inc.  
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-#include <stdio.h>
-#include "../basic_initialize/b3OpenCLUtils.h"
-#include "../host/b3GpuSapBroadphase.h"
-#include "Bullet3Common/b3Vector3.h"
-#include "parallel_primitives/host/b3FillCL.h"
-#include "parallel_primitives/host/b3BoundSearchCL.h"
-#include "parallel_primitives/host/b3RadixSort32CL.h"
-#include "parallel_primitives/host/b3PrefixScanCL.h"
-#include "Bullet3Common/b3CommandLineArgs.h"
-#include "Bullet3Common/b3MinMax.h"
-
-int g_nPassed = 0;
-int g_nFailed = 0;
-bool g_testFailed = 0;
-
-#define TEST_INIT g_testFailed = 0;
-#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
-#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
-#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
-
-cl_context g_context=0;
-cl_device_id g_device=0;
-cl_command_queue g_queue =0;
-const char* g_deviceName = 0;
-
-void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
-{
-	void* glCtx=0;
-	void* glDC = 0;
-	int ciErrNum = 0;
-	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
-
-	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-
-	g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	int numDev = b3OpenCLUtils::getNumDevices(g_context);
-	if (numDev>0)
-	{
-		b3OpenCLDeviceInfo info;
-		g_device= b3OpenCLUtils::getDevice(g_context,0);
-		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
-		oclCHECKERROR(ciErrNum, CL_SUCCESS);
-        b3OpenCLUtils::printDeviceInfo(g_device);
-		b3OpenCLUtils::getDeviceInfo(g_device,&info);
-		g_deviceName = info.m_deviceName;
-	}
-}
-
-void exitCL()
-{
-	clReleaseCommandQueue(g_queue);
-	clReleaseContext(g_context);
-}
-
-
-inline void broadphaseTest()
-{
-	TEST_INIT;
-
-	b3GpuSapBroadphase* sap = new b3GpuSapBroadphase(g_context,g_device,g_queue);
-	int group=1;
-	int mask=1;
-	b3Vector3 aabbMin(0,0,0);
-	b3Vector3 aabbMax(1,1,1);
-	int usrPtr = 1;
-	sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
-
-	aabbMin.setValue(1,1,1);
-	aabbMax.setValue(2,2,2);
-
-	usrPtr = 2;
-	sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
-	sap->writeAabbsToGpu();
-
-	sap->calculateOverlappingPairs();
-	
-	int numOverlap = sap->getNumOverlap();
-	cl_mem buf = sap->getOverlappingPairBuffer();
-	
-	TEST_ASSERT(numOverlap==1);
-
-	delete sap;
-
-	TEST_REPORT( "broadphaseTest" );
-}
-
-int main(int argc, char** argv)
-{
-	int preferredDeviceIndex = -1;
-	int preferredPlatformIndex = -1;
-
-	b3CommandLineArgs args(argc, argv);
-	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
-	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
-
-	initCL(preferredDeviceIndex,preferredPlatformIndex);
-
-
-	broadphaseTest();
-
-	printf("%d tests passed\n",g_nPassed, g_nFailed);
-	if (g_nFailed)
-	{
-		printf("%d tests failed\n",g_nFailed);
-	}
-	printf("End, press <enter>\n");
-
-	getchar();
-
-	exitCL();
-
-}
-
--- a/opencl/gpu_broadphase/test/premake4.lua
+++ b/opencl/gpu_broadphase/test/premake4.lua
@ -1,46 +0,0 @@
-function createProject(vendor)	
-	hasCL = findOpenCL(vendor)
-	
-	if (hasCL) then
-
-		project ("OpenCL_broadphase_test_" .. vendor)
-
-		initOpenCL(vendor)
-
-		language "C++"
-				
-		kind "ConsoleApp"
-		targetdir "../../../bin"
-		includedirs {"..","../..","../../../src"}
-		
-		
-		files {
-			"main.cpp",
-			"../../basic_initialize/b3OpenCLInclude.h",
-			"../../basic_initialize/b3OpenCLUtils.cpp",
-			"../../basic_initialize/b3OpenCLUtils.h",
-			"../host/b3GpuSapBroadphase.cpp",
-			"../host/b3GpuSapBroadphase.h",
-			"../../parallel_primitives/host/btFillCL.cpp",
-			"../../parallel_primitives/host/btFillCL.h",
-			"../../parallel_primitives/host/btBoundSearchCL.cpp",
-			"../../parallel_primitives/host/btBoundSearchCL.h",
-			"../../parallel_primitives/host/btPrefixScanCL.cpp",
-			"../../parallel_primitives/host/btPrefixScanCL.h",
-			"../../parallel_primitives/host/btRadixSort32CL.cpp",
-			"../../parallel_primitives/host/btRadixSort32CL.h",
-			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
-			"../../../src/Bullet3Common/b3AlignedAllocator.h",
-			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
-			"../../../src/Bullet3Common/b3Quickprof.cpp",
-			"../../../src/Bullet3Common/b3Quickprof.h",
-
-		}
-		
-	end
-end
-
-createProject("AMD")
-createProject("Intel")
-createProject("NVIDIA")
-createProject("Apple")
--- a/opencl/gpu_narrowphase/test/main.cpp
+++ b/opencl/gpu_narrowphase/test/main.cpp
@ -1,111 +0,0 @@
-/*
-Copyright (c) 2012 Advanced Micro Devices, Inc.  
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-#include <stdio.h>
-#include "../basic_initialize/b3OpenCLUtils.h"
-#include "../host/b3ConvexHullContact.h"
-
-#include "Bullet3Common/b3Vector3.h"
-#include "parallel_primitives/host/b3FillCL.h"
-#include "parallel_primitives/host/b3BoundSearchCL.h"
-#include "parallel_primitives/host/b3RadixSort32CL.h"
-#include "parallel_primitives/host/b3PrefixScanCL.h"
-#include "Bullet3Common/b3CommandLineArgs.h"
-#include "../host/b3ConvexHullContact.h"
-
-#include "Bullet3Common/b3MinMax.h"
-int g_nPassed = 0;
-int g_nFailed = 0;
-bool g_testFailed = 0;
-
-#define TEST_INIT g_testFailed = 0;
-#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
-#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
-#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
-
-cl_context g_context=0;
-cl_device_id g_device=0;
-cl_command_queue g_queue =0;
-const char* g_deviceName = 0;
-
-void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
-{
-	void* glCtx=0;
-	void* glDC = 0;
-	int ciErrNum = 0;
-	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
-
-	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-
-	g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	int numDev = b3OpenCLUtils::getNumDevices(g_context);
-	if (numDev>0)
-	{
-		b3OpenCLDeviceInfo info;
-		g_device= b3OpenCLUtils::getDevice(g_context,0);
-		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
-		oclCHECKERROR(ciErrNum, CL_SUCCESS);
-        b3OpenCLUtils::printDeviceInfo(g_device);
-		b3OpenCLUtils::getDeviceInfo(g_device,&info);
-		g_deviceName = info.m_deviceName;
-	}
-}
-
-void exitCL()
-{
-	clReleaseCommandQueue(g_queue);
-	clReleaseContext(g_context);
-}
-
-
-inline void gpuConvexHullContactTest()
-{
-	TEST_INIT;
-
-	TEST_ASSERT(1);
-
-	GpuSatCollision* sat = new GpuSatCollision(g_context,g_device,g_queue);
-
-	delete sat;
-
-	TEST_REPORT( "gpuConvexHullContactTest" );
-}
-
-int main(int argc, char** argv)
-{
-	int preferredDeviceIndex = -1;	int preferredPlatformIndex = -1;
-
-	b3CommandLineArgs args(argc, argv);
-	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
-	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
-
-	initCL(preferredDeviceIndex,preferredPlatformIndex);
-
-	gpuConvexHullContactTest();
-
-	printf("%d tests passed\n",g_nPassed, g_nFailed);
-	if (g_nFailed)
-	{
-		printf("%d tests failed\n",g_nFailed);
-	}
-	printf("End, press <enter>\n");
-
-	getchar();
-
-	exitCL();
-
-}
-
--- a/opencl/gpu_narrowphase/test/premake4.lua
+++ b/opencl/gpu_narrowphase/test/premake4.lua
@ -1,49 +0,0 @@
-function createProject(vendor)	
-	hasCL = findOpenCL(vendor)
-	
-	if (hasCL) then
-
-		project ("OpenCL_sat_test_" .. vendor)
-
-		initOpenCL(vendor)
-
-		language "C++"
-				
-		kind "ConsoleApp"
-		targetdir "../../../bin"
-		includedirs {"..","../..","../../../src"}
-		
-		
-		files {
-			"main.cpp",
-			"../../basic_initialize/b3OpenCLInclude.h",
-			"../../basic_initialize/b3OpenCLUtils.cpp",
-			"../../basic_initialize/b3OpenCLUtils.h",
-			"../host/**.cpp",
-			"../host/**.h",
-			"../../parallel_primitives/host/btFillCL.cpp",
-			"../../parallel_primitives/host/btFillCL.h",
-			"../../parallel_primitives/host/btBoundSearchCL.cpp",
-			"../../parallel_primitives/host/btBoundSearchCL.h",
-			"../../parallel_primitives/host/btPrefixScanCL.cpp",
-			"../../parallel_primitives/host/btPrefixScanCL.h",
-			"../../parallel_primitives/host/btRadixSort32CL.cpp",
-			"../../parallel_primitives/host/btRadixSort32CL.h",
-			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
-			"../../../src/Bullet3Common/b3AlignedAllocator.h",
-			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
-			"../../../src/Bullet3Common/b3Quickprof.cpp",
-			"../../../src/Bullet3Common/b3Quickprof.h",
-			"../../../src/Bullet3Geometry/**.cpp",
-			"../../../src/Bullet3Geometry/**.h",
-			
-
-		}
-		
-	end
-end
-
-createProject("AMD")
-createProject("Intel")
-createProject("NVIDIA")
-createProject("Apple")
--- a/opencl/lds_bank_conflict/lds_kernels.cl
+++ b/opencl/lds_bank_conflict/lds_kernels.cl
@ -1,171 +0,0 @@
-
-#define TILE_DIM  32
-#define BLOCK_ROWS  8
-
-
-/*// simple copy kernel (CUDA)
-// Used as reference case representing best effective bandwidth.
-__global__ void copy(float *odata, const float *idata)
-{
-  int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  int width = gridDim.x * TILE_DIM;
-
-  for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
-	odata[(y+j)*width + x] = idata[(y+j)*width + x];
-}
-*/
-// simple copy kernel (OpenCL)
-__kernel void copyKernel(__global float* odata, __global const float* idata)
-{
-  int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
-  int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
-  int width = get_num_groups(0) * get_local_size(0);
-  for (int j = 0; j < get_num_groups(1); j+= get_local_size(1))
-  {
-	odata[(y+j)*width + x] = idata[(y+j)*width + x];
-  }
-}
-
-/*
-// copy kernel using shared memory (CUDA)
-// Also used as reference case, demonstrating effect of using shared memory.
-__global__ void copySharedMem(float *odata, const float *idata)
-{
-  __shared__ float tile[TILE_DIM * TILE_DIM];
-  
-  int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  int width = gridDim.x * TILE_DIM;
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];          
-}
-*/
-
-// copy kernel using shared memory (OpenCL)
-// Also used as reference case, demonstrating effect of using shared memory.
-__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
-{
-  __local float tile[TILE_DIM * TILE_DIM];
-  
-  int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
-  int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
-  int width = get_num_groups(0) * get_local_size(0);
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
-}
-
-/*
-// naive transpose (CUDA)
-// Simplest transpose; doesn't use shared memory.
-// Global memory reads are coalesced but writes are not.
-__global__ void transposeNaive(float *odata, const float *idata)
-{
-  int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  int width = gridDim.x * TILE_DIM;
-
-  for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
-	odata[x*width + (y+j)] = idata[(y+j)*width + x];
-}
-*/
-
-// naive transpose (OpenCL)
-// Simplest transpose; doesn't use shared memory.
-// Global memory reads are coalesced but writes are not.
-__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
-{
-  int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
-  int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
-  int width = get_num_groups(0) * get_local_size(0);
-
-  for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
-	odata[x*width + (y+j)] = idata[(y+j)*width + x];
-}
-
-/*
-// coalesced transpose (CUDA)
-// Uses shared memory to achieve coalesing in both reads and writes
-// Tile width == #banks causes shared memory bank conflicts.
-__global__ void transposeCoalesced(float *odata, const float *idata)
-{
-  __shared__ float tile[TILE_DIM][TILE_DIM];
-	
-  int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  int width = gridDim.x * TILE_DIM;
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
-
-  __syncthreads();
-
-  x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * TILE_DIM + threadIdx.y;
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
-}
-*/
-
-// coalesced transpose (OpenCL)
-// Uses shared memory to achieve coalesing in both reads and writes
-// Tile width == #banks causes shared memory bank conflicts.
-__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
-{
-  __local float tile[TILE_DIM][TILE_DIM];
-	
-  int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
-  int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
-  int width = get_num_groups(0) * get_local_size(0);
-    
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  x = get_group_id(1) * TILE_DIM + get_local_id(0);
-  y = get_group_id(0) * TILE_DIM + get_local_id(1);
-  
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
-}
-
-
-// No bank-conflict transpose (OpenCL)
-// Same as transposeCoalesced except the first tile dimension is padded 
-// to avoid shared memory bank conflicts.
-__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
-{
-  __local float tile[TILE_DIM][TILE_DIM+1];
-	
-  int x = get_group_id(0) * get_num_groups(0) + get_local_id(0);
-  int y = get_group_id(1) * get_num_groups(1) + get_local_id(1);
-  int width = get_num_groups(0) * get_local_size(0);
-    
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  x = get_group_id(1) * TILE_DIM + get_local_id(0);
-  y = get_group_id(0) * TILE_DIM + get_local_id(1);
-  
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-	 odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
-}
-
-
-
--- a/opencl/lds_bank_conflict/main.cpp
+++ b/opencl/lds_bank_conflict/main.cpp
@ -1,361 +0,0 @@
-//Adapted from CUDA to OpenCL by Erwin Coumans
-//See http://bitbucket.org/erwincoumans/opencl_course
-
-// Copyright 2012 NVIDIA Corporation
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// 
-//     http://www.apache.org/licenses/LICENSE-2.0
-// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "b3OpenCLUtils.h"
-#include "../parallel_primitives/host/b3OpenCLArray.h"
-#include "../parallel_primitives/host/b3LauncherCL.h"
-#include "Bullet3Common/b3Quickprof.h"
-#include "../parallel_primitives/host/b3FillCL.h"
-#include "Bullet3Common/b3CommandLineArgs.h"
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-
-//make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl (the kernels are compiled from that file at run time and must agree with these values)
-const int TILE_DIM = 32;	// tile side length; main() also uses it as the work-group size in X
-const int BLOCK_ROWS = 8;	// main() uses it as the work-group size in Y
-const int NUM_REPS = 100;	// number of timed launches per kernel; also scales the printed bandwidth
-
-// Verifies res against ref over n elements and reports the result on stdout.
-// On the first mismatch, prints the index, both values and a FAILED marker.
-// If every element matches, prints the bandwidth figure derived from
-// 2 * n floats (one read + one write per element), NUM_REPS repetitions and
-// the elapsed time ms in milliseconds.
-void postprocess(const float *ref, const float *res, int n, float ms)
-{
-	// advance to the first element that differs (or to n when none does)
-	int idx = 0;
-	while (idx < n && !(res[idx] != ref[idx]))
-	{
-		++idx;
-	}
-
-	if (idx < n)
-	{
-		printf("\nError: at res[%d] got %f but expected %f\n", idx, res[idx], ref[idx]);
-		printf("%25s\n", "*** FAILED ***");
-		return;
-	}
-
-	printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms );
-}
-
-// Loads the file cFilename and returns a newly malloc'ed, NUL-terminated
-// string made of cPreamble followed by the file contents.
-//
-// cFilename      path of the file to read (opened in binary mode)
-// cPreamble      text placed in front of the file contents; must not be NULL
-// szFinalLength  optional; on success receives the combined length, not
-//                counting the terminating NUL
-//
-// Returns NULL when the file cannot be opened, its size cannot be determined,
-// the buffer cannot be allocated or the file cannot be read completely.
-// The caller owns the returned buffer and releases it with free().
-char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-	// open the OpenCL source code file
-	FILE* pFileStream = fopen(cFilename, "rb");
-	if (pFileStream == NULL)
-	{
-		return NULL;
-	}
-
-	const size_t szPreambleLength = strlen(cPreamble);
-	char* cSourceString = NULL;
-
-	// get the length of the source code; ftell reports failure as -1, which
-	// must not be turned into a (huge) size_t
-	long fileLength = -1;
-	if (fseek(pFileStream, 0, SEEK_END) == 0)
-	{
-		fileLength = ftell(pFileStream);
-	}
-
-	if (fileLength >= 0 && fseek(pFileStream, 0, SEEK_SET) == 0)
-	{
-		const size_t szSourceLength = (size_t)fileLength;
-
-		// allocate a buffer for preamble + source + terminator and fill it
-		cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
-		if (cSourceString != NULL)
-		{
-			memcpy(cSourceString, cPreamble, szPreambleLength);
-			if (fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) == szSourceLength)
-			{
-				cSourceString[szSourceLength + szPreambleLength] = '\0';
-				if (szFinalLength != NULL)
-				{
-					*szFinalLength = szSourceLength + szPreambleLength;
-				}
-			}
-			else
-			{
-				// short read: do not hand back a partially filled buffer
-				free(cSourceString);
-				cSourceString = NULL;
-			}
-		}
-	}
-
-	fclose(pFileStream);
-	return cSourceString;
-}
-
-// Runs `kernel` once as a warm-up and then NUM_REPS more times over the given
-// 2D range, binding dst and then src as its buffer arguments.
-// Returns the wall-clock time in milliseconds from just before the first
-// timed launch until clFinish(queue) has returned.
-static float timeKernelLaunches(cl_command_queue queue, cl_kernel kernel,
-	b3OpenCLArray<float>& dst, b3OpenCLArray<float>& src,
-	int numThreadsX, int numThreadsY, int localSizeX, int localSizeY, b3Clock& clock)
-{
-	b3LauncherCL launcher( queue, kernel);
-	launcher.setBuffer( dst.getBufferCL());
-	launcher.setBuffer( src.getBufferCL());
-
-	// warm up
-	launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
-
-	const double startMs = clock.getTimeMicroseconds()/1e3;
-	for (int i = 0; i < NUM_REPS; i++)
-	{
-		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
-	}
-	clFinish(queue);
-	const double stopMs = clock.getTimeMicroseconds()/1e3;
-
-	return float(stopMs-startMs);
-}
-
-// Benchmark driver: builds the five kernels from lds_kernels.cl, runs each of
-// them on a 1024x1024 float matrix, checks the result against a host
-// reference and prints one bandwidth line per kernel.
-// Accepts --deviceId=<id> and --platformId=<id> to choose the OpenCL device.
-// Returns 0 after the benchmark ran, 1 when the kernel source file is missing.
-int main(int argc, char **argv)
-{
-	printf("Use --deviceId=<id> or --platformId=<id> to override OpenCL device\n");
-	b3CommandLineArgs args(argc,argv);
-
-	const int nx = 1024;
-	const int ny = 1024;
-
-	const int mem_size = nx*ny*sizeof(float);
-	const int num_elements = nx*ny;
-	b3Clock clock;
-
-	int localSizeX = TILE_DIM;
-	int localSizeY = BLOCK_ROWS;
-
-	// one work-item per matrix column, BLOCK_ROWS work-items per TILE_DIM rows
-	int numThreadsX = (nx/TILE_DIM)*TILE_DIM;
-	int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS;
-
-	int ciErrNum = 0;
-	int preferred_device = -1;
-	int preferred_platform = -1;
-	args.GetCmdLineArgument("deviceId",preferred_device);
-	args.GetCmdLineArgument("platformId",preferred_platform);
-
-	cl_platform_id		platformId=0;
-	cl_context			ctx=0;
-	cl_command_queue	queue=0;
-	cl_device_id		device=0;
-	cl_kernel			copyKernel=0;
-	cl_kernel			copySharedMemKernel=0;
-	cl_kernel			transposeNaiveKernel = 0;
-	cl_kernel			transposeCoalescedKernel = 0;
-	cl_kernel			transposeNoBankConflictsKernel= 0;
-
-	// check the error code before using anything the call produced
-	ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	b3OpenCLUtils::printPlatformInfo(platformId);
-	device = b3OpenCLUtils::getDevice(ctx,0);
-	b3OpenCLUtils::printDeviceInfo(device);
-	queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-
-	const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl";
-
-	size_t szKernelLength;
-
-	const char* cSourceCL =0;
-	char relativeFileName[1024];
-
-	{
-		// the executable may be started from several levels below the source root
-		const char* prefix[]={"./","../","../../","../../../","../../../../"};
-		int numPrefixes = sizeof(prefix)/sizeof(char*);
-
-		for (int i=0;!cSourceCL && i<numPrefixes;i++)
-		{
-			sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
-			cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
-			if (cSourceCL)
-			{
-				printf("Loaded program source: %s\n", relativeFileName); 
-			}
-		}
-	}
-	if (!cSourceCL)
-	{
-		printf("Couldn't find file %s, exiting\n",cSourceFile);
-		clReleaseCommandQueue(queue);
-		clReleaseContext(ctx);
-		return 1;
-	}
-
-	char flags[1024]={0};
-#ifdef CL_PLATFORM_INTEL
-///use this flag to allow for OpenCL kernel debugging on CPU using the Intel OpenCL run-time
-	//sprintf(flags,"-g -s \"%s\"","C:/develop/opencl_course/opencl/lds_bank_conflict/lds_kernels.cl");
-#endif//CL_PLATFORM_INTEL
-
-	copyKernel  = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copyKernel",&ciErrNum,0,flags);
-	copySharedMemKernel  = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copySharedMemKernel",&ciErrNum,0,flags);
-	transposeNaiveKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNaiveKernel",&ciErrNum,0,flags);
-	transposeCoalescedKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeCoalescedKernel",&ciErrNum,0,flags);
-	transposeNoBankConflictsKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNoBankConflictsKernel",&ciErrNum,0,flags);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-
-	b3FillCL clMemSet(ctx,device,queue);
-
-	printf("\n============================================\n");
-
-	printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n", 
-		 nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);
-
-	float *h_idata = (float*)malloc(mem_size);
-	float *h_cdata = (float*)malloc(mem_size);
-	float *h_tdata = (float*)malloc(mem_size);
-	float *gold    = (float*)malloc(mem_size);
-
-	b3OpenCLArray<float> d_idataCL(ctx,queue);d_idataCL.resize(num_elements);
-	b3OpenCLArray<float> d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements);
-	b3OpenCLArray<float> d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements);
-
-	// declared without initializer before the first goto so that no jump to
-	// error_exit bypasses an initialization
-	float ms;
-
-	if (!h_idata || !h_cdata || !h_tdata || !gold)
-	{
-		printf("Out of host memory\n");
-		goto error_exit;
-	}
-
-	// check parameters and calculate execution configuration
-	if (nx % TILE_DIM || ny % TILE_DIM) 
-	{
-		printf("nx and ny must be a multiple of TILE_DIM\n");
-		goto error_exit;
-	}
-
-	if (TILE_DIM % BLOCK_ROWS) 
-	{
-		printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
-		goto error_exit;
-	}
-
-	// host input: every element holds its own linear index
-	for (int j = 0; j < ny; j++)
-		for (int i = 0; i < nx; i++)
-			h_idata[j*nx + i] = j*nx + i;
-
-	// correct result for error checking: the transpose of the input
-	for (int j = 0; j < ny; j++)
-		for (int i = 0; i < nx; i++)
-			gold[j*nx + i] = h_idata[i*nx + j];
-
-	d_idataCL.copyFromHostPointer(h_idata,num_elements);
-
-	clock.reset();
-
-	// ------------
-	// time kernels
-	// ------------
-	printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
-
-	// copy
-	printf("%25s", "copy");
-	clMemSet.execute(d_cdataCL,0.f,num_elements);
-	ms = timeKernelLaunches(queue, copyKernel, d_cdataCL, d_idataCL, numThreadsX, numThreadsY, localSizeX, localSizeY, clock);
-	d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
-	postprocess(h_idata, h_cdata, nx*ny, ms);
-
-	// copySharedMem
-	printf("%25s", "shared memory copy");
-	clMemSet.execute(d_cdataCL,0.f,num_elements);
-	ms = timeKernelLaunches(queue, copySharedMemKernel, d_cdataCL, d_idataCL, numThreadsX, numThreadsY, localSizeX, localSizeY, clock);
-	d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
-	postprocess(h_idata, h_cdata, nx * ny, ms);
-
-	// transposeNaive
-	printf("%25s", "naive transpose");
-	clMemSet.execute(d_tdataCL,0.f,num_elements);
-	ms = timeKernelLaunches(queue, transposeNaiveKernel, d_tdataCL, d_idataCL, numThreadsX, numThreadsY, localSizeX, localSizeY, clock);
-	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
-	postprocess(gold, h_tdata, nx * ny, ms);
-
-	// transposeCoalesced
-	printf("%25s", "coalesced transpose");
-	clMemSet.execute(d_tdataCL,0.f,num_elements);
-	ms = timeKernelLaunches(queue, transposeCoalescedKernel, d_tdataCL, d_idataCL, numThreadsX, numThreadsY, localSizeX, localSizeY, clock);
-	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
-	postprocess(gold, h_tdata, nx * ny, ms);
-
-	// transposeNoBankConflicts
-	printf("%25s", "conflict-free transpose");
-	clMemSet.execute(d_tdataCL,0.f,num_elements);
-	ms = timeKernelLaunches(queue, transposeNoBankConflictsKernel, d_tdataCL, d_idataCL, numThreadsX, numThreadsY, localSizeX, localSizeY, clock);
-	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
-	postprocess(gold, h_tdata, nx * ny, ms);
-
-error_exit:
-	// cleanup: release every kernel that was created, not only the first one
-	if (copyKernel) clReleaseKernel(copyKernel);
-	if (copySharedMemKernel) clReleaseKernel(copySharedMemKernel);
-	if (transposeNaiveKernel) clReleaseKernel(transposeNaiveKernel);
-	if (transposeCoalescedKernel) clReleaseKernel(transposeCoalescedKernel);
-	if (transposeNoBankConflictsKernel) clReleaseKernel(transposeNoBankConflictsKernel);
-	clReleaseCommandQueue(queue);
-	clReleaseContext(ctx);
-
-	// loadProgSource hands over ownership of the source string
-	free((void*)cSourceCL);
-	free(h_idata);
-	free(h_tdata);
-	free(h_cdata);
-	free(gold);
-	printf("Press <enter>\n");
-	getchar();
-	return 0;
-}
--- a/opencl/lds_bank_conflict/premake4.lua
+++ b/opencl/lds_bank_conflict/premake4.lua
@ -1,44 +0,0 @@
-
--- Declares the console project "OpenCL_lds_bank_conflict_<vendor>" when
--- findOpenCL(vendor) reports that OpenCL is available for that vendor;
--- does nothing otherwise.
-function createProject (vendor)
-
-	if (not findOpenCL(vendor)) then
-		return
-	end
-
-	project ( "OpenCL_lds_bank_conflict_" .. vendor)
-
-	initOpenCL(vendor)
-
-	language "C++"
-
-	kind "ConsoleApp"
-	targetdir "../../bin"
-
-	links {
-		"OpenCL_lib_parallel_primitives_host_" .. vendor
-	}
-
-	includedirs {
-		"../basic_initialize",
-		"../../src"
-	}
-
-	files {
-		"main.cpp",
-		"../basic_initialize/b3OpenCLUtils.cpp",
-		"../basic_initialize/b3OpenCLUtils.h",
-		"../../src/Bullet3Common/b3AlignedAllocator.cpp",
-		"../../src/Bullet3Common/b3AlignedAllocator.h",
-		"../../src/Bullet3Common/b3AlignedObjectArray.h",
-		"../../src/Bullet3Common/b3Quickprof.cpp",
-		"../../src/Bullet3Common/b3Quickprof.h",
-	}
-
-end
-
--- one project per supported OpenCL vendor, in the original order
-for _, vendor in ipairs({"AMD", "NVIDIA", "Intel", "Apple"}) do
-	createProject(vendor)
-end
--- a/opencl/parallel_primitives/benchmark/premake4.lua
+++ b/opencl/parallel_primitives/benchmark/premake4.lua
@ -1,40 +0,0 @@
--- Declares the console project "OpenCL_radixsort_benchmark_<vendor>" when
--- findOpenCL(vendor) reports that OpenCL is available for that vendor;
--- does nothing otherwise.
-function createProject(vendor)
-	-- local, so the lookup result does not leak into the global environment
-	local hasCL = findOpenCL(vendor)
-	
-	if (hasCL) then
-
-		project ("OpenCL_radixsort_benchmark_" .. vendor)
-
-		initOpenCL(vendor)
-		
-		language "C++"
-				
-		kind "ConsoleApp"
-		targetdir "../../../bin"
-		includedirs {"..","../../../src"}
-		
-		links {
-			("OpenCL_lib_parallel_primitives_host_" .. vendor)
-		}
-		
-		files {
-			"test_large_problem_sorting.cpp",
-			"../../basic_initialize/b3OpenCLUtils.cpp",
-			"../../basic_initialize/b3OpenCLUtils.h",
-			"../host/b3FillCL.cpp",
-			"../host/b3PrefixScanCL.cpp",
-			"../host/b3RadixSort32CL.cpp",
-			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
-			"../../../src/Bullet3Common/b3AlignedAllocator.h",
-			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
-			"../../../src/Bullet3Common/b3Quickprof.cpp",
-			"../../../src/Bullet3Common/b3Quickprof.h",
-		}
-		
-	end
-end
-
-createProject("AMD")
-createProject("Intel")
-createProject("NVIDIA")
-createProject("Apple")
--- a/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp
+++ b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp
@ -1,711 +0,0 @@
-/******************************************************************************
- * Copyright 2010 Duane Merrill
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- * 
- * 
- * 
- * 
- * AUTHORS' REQUEST: 
- * 
- * 		If you use|reference|benchmark this code, please cite our Technical 
- * 		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
- * 
- *		@TechReport{ Merrill:Sorting:2010,
- *        	author = "Duane Merrill and Andrew Grimshaw",
- *        	title = "Revisiting Sorting for GPGPU Stream Architectures",
- *        	year = "2010",
- *        	institution = "University of Virginia, Department of Computer Science",
- *        	address = "Charlottesville, VA, USA",
- *        	number = "CS2010-03"
- *		}
- * 
- * For more information, see our Google Code project site: 
- * http://code.google.com/p/back40computing/
- * 
- * Thanks!
- ******************************************************************************/
-
-/******************************************************************************
- * Simple test driver program for *large-problem* radix sorting.
- *
- * Useful for demonstrating how to integrate radix sorting into 
- * your application 
- ******************************************************************************/
-
-/******************************************************************************
- * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
- ******************************************************************************/
-#ifdef _WIN32
-#pragma warning (disable:4996)
-#endif
-#include <stdlib.h> 
-#include <stdio.h> 
-#include <string.h> 
-#include <math.h> 
-#include <float.h>
-#include <algorithm>
-#include <string>
-
-
-//#include <iostream>
-#include <sstream>
-/**********************
-*
-*/
-
-#include "../host/b3RadixSort32CL.h"
-#include "../../basic_initialize/b3OpenCLUtils.h"
-#include "Bullet3Common/b3Quickprof.h"
-
-cl_context g_cxMainContext;	// OpenCL context; created in main()
-cl_device_id g_device;	// device obtained from g_cxMainContext in main()
-cl_command_queue g_cqCommandQueue;	// command queue created in main(); used by the TimedSort routines
-
-/***********************
-* Test configuration, filled in from the command line in main()
-*/
-
-bool g_verbose;	// set from the --v flag; when true TestSort prints the sorted keys
-///Preferred OpenCL device/platform. When < 0 then no preference is used. 
-///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
-///Preferred device/platform take priority over this platform-vendor match
-int gPreferredDeviceId = -1;	// overridden by --deviceId
-int gPreferredPlatformId = -1;	// overridden by --platformId
-
-
-
-/******************************************************************************
- * Routines
- ******************************************************************************/
-
-
-/**
- * Keys-only sorting.  Uses the GPU to sort the specified vector of elements for the given 
- * number of iterations, displaying runtime information.
- *
- * @param[in] 		num_elements 
- * 		Size in elements of the vector to sort
- * @param[in,out] 	h_keys 
- * 		Vector of keys to sort; overwritten with the contents of the device
- * 		buffer read back after the last sort
- * @param[in] 		iterations  
- * 		Number of times to invoke the GPU sorting primitive
- */
-template <typename K>
-void TimedSort(
-	unsigned int num_elements, 
-	K *h_keys,
-	unsigned int iterations)
-{
-	// both counters are unsigned, so print them with %u
-	printf("Keys only, %u iterations, %u elements\n", iterations, num_elements);
-
-	const int numKeys = (int) num_elements;
-
-	// stage the keys as unsigned int, the element type handed to the sorter
-	b3AlignedObjectArray<unsigned int> hostData;
-	hostData.resize(numKeys);
-	for (int i=0;i<numKeys;i++)
-	{
-		hostData[i] = h_keys[i];
-	}
-
-	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
-
-	// first, untimed run; its read-back result is not used any further
-	b3OpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
-	gpuData.copyFromHost(hostData);
-	sorter.execute(gpuData);
-
-	b3AlignedObjectArray<unsigned int> hostDataSorted;
-	gpuData.copyToHost(hostDataSorted);
-
-	clFinish(g_cqCommandQueue);
-
-	// Perform the timed number of sorting iterations
-	double elapsed = 0;
-	float duration = 0;
-	b3Clock watch;
-
-	//warm-start
-	gpuData.copyFromHost(hostData);
-	sorter.execute(gpuData);
-	clFinish(g_cqCommandQueue);
-
-	watch.reset();
-
-	for (unsigned int i = 0; i < iterations; i++) 
-	{
-		// Move a fresh copy of the problem into device storage
-		gpuData.copyFromHost(hostData);
-		clFinish(g_cqCommandQueue);
-
-		// Start GPU timing record
-		double startMs = watch.getTimeMicroseconds()/1e3;
-
-		// Call the sorting API routine
-		sorter.execute(gpuData);
-		clFinish(g_cqCommandQueue);
-
-		// End GPU timing record
-		double stopMs = watch.getTimeMicroseconds()/1e3;
-
-		duration = stopMs - startMs;
-		elapsed += (double) duration;
-		printf("duration = %f\n", duration);
-	}
-
-	// Display timing information (elements per millisecond / 1000 = 10^6 elements per second)
-	double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
-	double throughput = (avg_runtime > 0.0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0; 
-	printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
-
-	// hand the device buffer contents back to the caller
-	gpuData.copyToHost(hostData);
-	for (int i=0;i<numKeys;i++)
-	{
-		h_keys[i] = hostData[i];
-	}
-}
-
-/**
- * Key-value sorting.  Uses the GPU to sort the specified vector of elements for the given 
- * number of iterations, displaying runtime information.
- *
- * @param[in] 		num_elements 
- * 		Size in elements of the vector to sort
- * @param[in,out] 	h_keys 
- * 		Vector of keys to sort; overwritten with the keys read back from the
- * 		device buffer after the last sort
- * @param[in,out] 	h_values  
- * 		Vector of values to sort; overwritten together with the keys
- * @param[in] 		iterations  
- * 		Number of times to invoke the GPU sorting primitive
- */
-template <typename K, typename V>
-void TimedSort(
-	unsigned int num_elements, 
-	K *h_keys,
-	V *h_values, 
-	unsigned int iterations) 
-{
-	// both counters are unsigned, so print them with %u
-	printf("Key-values, %u iterations, %u elements\n", iterations, num_elements);
-
-	const int numPairs = (int) num_elements;
-
-	// pack keys and values into the pair type handed to the sorter
-	b3AlignedObjectArray<b3SortData> hostData;
-	hostData.resize(numPairs);
-	for (int i=0;i<numPairs;i++)
-	{
-		hostData[i].m_key = h_keys[i];
-		hostData[i].m_value = h_values[i];
-	}
-
-	b3RadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
-
-	// first, untimed run; its read-back result is not used any further
-	b3OpenCLArray<b3SortData> gpuData(g_cxMainContext,g_cqCommandQueue);
-	gpuData.copyFromHost(hostData);
-	sorter.execute(gpuData);
-
-	b3AlignedObjectArray<b3SortData> hostDataSorted;
-	gpuData.copyToHost(hostDataSorted);
-
-	clFinish(g_cqCommandQueue);
-
-	// Perform the timed number of sorting iterations
-	double elapsed = 0;
-	float duration = 0;
-	b3Clock watch;
-
-	//warm-start
-	gpuData.copyFromHost(hostData);
-	sorter.execute(gpuData);
-	clFinish(g_cqCommandQueue);
-
-	watch.reset();
-
-	for (unsigned int i = 0; i < iterations; i++) 
-	{
-		// Move a fresh copy of the problem into device storage
-		gpuData.copyFromHost(hostData);
-		clFinish(g_cqCommandQueue);
-
-		// Start GPU timing record
-		double startMs = watch.getTimeMicroseconds()/1e3;
-
-		// Call the sorting API routine
-		sorter.execute(gpuData);
-		clFinish(g_cqCommandQueue);
-
-		// End GPU timing record
-		double stopMs = watch.getTimeMicroseconds()/1e3;
-
-		duration = stopMs - startMs;
-		elapsed += (double) duration;
-		printf("duration = %f\n", duration);
-	}
-
-	// Display timing information (elements per millisecond / 1000 = 10^6 elements per second)
-	double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
-	double throughput = (avg_runtime > 0.0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0; 
-	printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
-
-	// hand the device buffer contents back to the caller
-	gpuData.copyToHost(hostData);
-	for (int i=0;i<numPairs;i++)
-	{
-		h_keys[i] = hostData[i].m_key;
-		h_values[i] = hostData[i].m_value;
-	}
-}
-
-
-
-/**
- * Generates a random key of type K, one byte at a time.
- * 
- * Each byte is taken from bits 7..14 of a rand() result (rand() >> 7,
- * truncated to 8 bits) because the higher-order bits returned by rand() are
- * commonly considered more uniformly distributed than the lower-order bits.
- * 
- * We can decrease the entropy level of keys by adopting the technique 
- * of Thearling and Smith in which keys are computed from the bitwise AND of 
- * multiple random samples: 
- * 
- * entropy_reduction	| Effectively-unique bits per key
- * -----------------------------------------------------
- * -1					| 0
- * 0					| 32
- * 1					| 25.95
- * 2					| 17.41
- * 3					| 10.78
- * 4					| 6.42
- * ...					| ...
- * 
- * @param[out] 		key 
- * 		Receives the generated key
- * @param[in] 		entropy_reduction 
- * 		Each byte is the AND of (entropy_reduction + 1) samples; with -1 no
- * 		sample is drawn and every byte is 0xff
- * @param[in] 		lower_key_bits 
- * 		When in [0, 8 * sizeof(K)), only that many low bits of the key's byte
- * 		image are kept; any other value leaves the key unmasked.
- * 		Masking copies the key through an unsigned long long, so it requires
- * 		sizeof(K) <= sizeof(unsigned long long).
- */
-template <typename K>
-void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
-{
-	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
-	unsigned char key_bits[NUM_UCHARS];
-	
-	do {
-	
-		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
-			unsigned char quarterword = 0xff;
-			for (int i = 0; i <= entropy_reduction; i++) {
-				quarterword &= (rand() >> 7);
-			}
-			key_bits[j] = quarterword;
-		}
-		
-		if (lower_key_bits >= 0 && (size_t) lower_key_bits < sizeof(K) * 8) {
-			unsigned long long base = 0;
-			memcpy(&base, key_bits, sizeof(K));
-			// build the mask in 64 bits: shifting the int constant 1 is
-			// undefined once lower_key_bits reaches the width of int
-			base &= (1ull << lower_key_bits) - 1;
-			memcpy(key_bits, &base, sizeof(K));
-		}
-		
-		memcpy(&key, key_bits, sizeof(K));
-		
-	} while (key != key);		// avoids NaNs when generating random floating point numbers 
-}
-
-
-/******************************************************************************
- * Templated routines for printing keys/values to the console 
- *
- * The primary template prints with "%d" and is therefore only correct for
- * types that are passed to printf as int; every other supported type has an
- * explicit specialization below with the matching conversion specifier.
- ******************************************************************************/
-
-template<typename T> 
-void PrintValue(T val) {
-	printf("%d", val);
-}
-
-template<>
-void PrintValue<float>(float val) {
-	printf("%f", val);
-}
-
-template<>
-void PrintValue<double>(double val) {
-	printf("%f", val);
-}
-
-// unsigned char and unsigned short are promoted to int when passed through
-// "...", so convert explicitly to the unsigned int that "%u" expects
-template<>
-void PrintValue<unsigned char>(unsigned char val) {
-	printf("%u", (unsigned int) val);
-}
-
-template<>
-void PrintValue<unsigned short>(unsigned short val) {
-	printf("%u", (unsigned int) val);
-}
-
-template<>
-void PrintValue<unsigned int>(unsigned int val) {
-	printf("%u", val);
-}
-
-template<>
-void PrintValue<long>(long val) {
-	printf("%ld", val);
-}
-
-template<>
-void PrintValue<unsigned long>(unsigned long val) {
-	printf("%lu", val);
-}
-
-template<>
-void PrintValue<long long>(long long val) {
-	printf("%lld", val);
-}
-
-template<>
-void PrintValue<unsigned long long>(unsigned long long val) {
-	printf("%llu", val);
-}
-
-
-
-/**
- * Compares two arrays element by element and reports the outcome on stdout.
- *
- * Prints "CORRECT" when all len elements are equal. Otherwise prints the
- * index and the two values of the first difference and, when verbose is set,
- * the elements of both arrays from up to 5 positions before the difference
- * to 4 positions after it (clipped to the array bounds).
- *
- * @return 0 when the arrays are equal, 1 at the first difference
- */
-template <typename T, typename SizeT>
-int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
-{
-	printf("\n");
-
-	// advance to the first differing element, or to len when there is none
-	SizeT i = 0;
-	while (i < len && !(computed[i] != reference[i])) {
-		i++;
-	}
-
-	if (!(i < len)) {
-		printf("CORRECT\n");
-		return 0;
-	}
-
-	printf("INCORRECT: [%lu]: ", (unsigned long) i);
-	PrintValue<T>(computed[i]);
-	printf(" != ");
-	PrintValue<T>(reference[i]);
-
-	if (verbose) {
-		size_t j;
-
-		printf("\nresult[...");
-		j = (i >= 5) ? i - 5 : 0;
-		while ((j < i + 5) && (j < len)) {
-			PrintValue<T>(computed[j]);
-			printf(", ");
-			j++;
-		}
-		printf("...]");
-
-		printf("\nreference[...");
-		j = (i >= 5) ? i - 5 : 0;
-		while ((j < i + 5) && (j < len)) {
-			PrintValue<T>(reference[j]);
-			printf(", ");
-			j++;
-		}
-		printf("...]");
-	}
-
-	return 1;
-}
-
-/**
- * Creates an example sorting problem whose keys is a vector of the specified 
- * number of K elements, values of V elements, and then dispatches the problem 
- * to the GPU for the given number of iterations, displaying runtime information.
- * The sorted keys are checked against a std::sort of the same input.
- *
- * @param[in] 		iterations  
- * 		Number of times to invoke the GPU sorting primitive
- * @param[in] 		num_elements 
- * 		Size in elements of the vector to sort
- * @param[in] 		keys_only 
- * 		When true only keys are sorted; otherwise each key is paired with a
- * 		value that is initialized to the key
- */
-template<typename K, typename V>
-void TestSort(
-	unsigned int iterations,
-	int num_elements,
-	bool keys_only)
-{
-    // Allocate the sorting problem on the host and fill the keys with random bytes
-
-	K *h_keys = (K*) malloc(num_elements * sizeof(K));
-	K *h_reference_keys = (K*) malloc(num_elements * sizeof(K));
-	V *h_values = keys_only ? NULL : (V*) malloc(num_elements * sizeof(V));
-
-	// malloc(0) may legitimately return NULL, so only treat NULL as a failure
-	// when elements were requested
-	if (num_elements != 0 && (!h_keys || !h_reference_keys || (!keys_only && !h_values))) {
-		printf("error: out of host memory\n");
-		free(h_keys);
-		free(h_reference_keys);
-		free(h_values);
-		return;
-	}
-
-	// Use random bits
-	for (int i = 0; i < num_elements; ++i) {
-		RandomBits<K>(h_keys[i], 0);
-		if (!keys_only)
-			h_values[i] = h_keys[i];
-
-		h_reference_keys[i] = h_keys[i];
-	}
-
-    // Run the timing test 
-	if (keys_only) {
-		TimedSort<K>(num_elements, h_keys, iterations);
-	} else {
-		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
-	}
-
-	// Display sorted key data
-	if (g_verbose) {
-		printf("\n\nKeys:\n");
-		for (int i = 0; i < num_elements; i++) {	
-			PrintValue<K>(h_keys[i]);
-			printf(", ");
-		}
-		printf("\n\n");
-	}	
-	
-    // Verify solution
-	std::sort(h_reference_keys, h_reference_keys + num_elements);	
-	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
-	printf("\n");
-	fflush(stdout);
-
-	// Free our allocated host memory, including the reference copy
-	free(h_keys);
-	free(h_reference_keys);
-	free(h_values);
-}
-
-
-
-/**
- * Displays the commandline usage for this tool.
- * The options and defaults listed here mirror what main() parses and
- * initializes; keep the two in sync.
- */
-void Usage() 
-{
-	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n"); 
-	printf("\n");
-	printf("\t--help\tDisplays this message.\n");
-	printf("\n");
-	printf("\t--v\tDisplays sorted results to the console.\n");
-	printf("\n");
-	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
-	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
-	printf("\n");
-	printf("\t--n\tThe number of elements to comprise the sample problem\n");
-	printf("\t\t\tDefault = 8388608\n");
-	printf("\n");
-	printf("\t--key-values\tSpecifies that keys are accompanied by value pairings\n");
-	printf("\n");
-	printf("\t--deviceId\tPreferred OpenCL device. Default = no preference\n");
-	printf("\n");
-	printf("\t--platformId\tPreferred OpenCL platform. Default = no preference\n");
-	printf("\n");
-}
-
-
-/******************************************************************************
- * Command-line parsing
- ******************************************************************************/
-#include <map>
-#include <algorithm>
-#include <string>
-
-/**
- * Minimal command-line parser. Collects every argument of the form "--name"
- * or "--name=value" into a name -> value map ("--name" maps to the empty
- * string); all other arguments are ignored. A name given more than once keeps
- * its last value.
- */
-class b3CommandLineArgs
-{
-protected:
-
-	// option name (without the leading "--") -> value text
-	std::map<std::string, std::string> pairs;
-
-public:
-
-	// Constructor: parses argv[1] .. argv[argc-1]
-	b3CommandLineArgs(int argc, char **argv)
-	{
-		using namespace std;
-
-	    for (int i = 1; i < argc; i++)
-	    {
-	        const string arg = argv[i];
-
-	        // compare() stays in bounds for arguments shorter than two
-	        // characters, unlike indexing arg[1] on an empty string
-	        if (arg.compare(0, 2, "--") != 0) {
-	        	continue;
-	        }
-
-	        const string::size_type pos = arg.find('=');
-	        if (pos == string::npos) {
-	        	pairs[arg.substr(2)] = "";
-	        } else {
-	        	pairs[arg.substr(2, pos - 2)] = arg.substr(pos + 1);
-	        }
-	    }
-	}
-
-	// Returns true when "--arg_name" (with or without a value) was given
-	bool CheckCmdLineFlag(const char* arg_name)
-	{
-		return pairs.find(arg_name) != pairs.end();
-	}
-
-	// Stores the value of "--arg_name" in val; see the definitions outside
-	// the class for what happens when the option is absent
-	template <typename T>
-	void GetCmdLineArgument(const char *arg_name, T &val);
-
-	// Number of distinct option names that were parsed
-	int ParsedArgc()
-	{
-		return (int) pairs.size();
-	}
-};
-
-// Generic lookup: when "--arg_name" was given, its value text is converted
-// with operator>> and stored in val. When the option is absent val is left
-// as it was.
-template <typename T>
-void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
-{
-	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
-	if (found == pairs.end()) {
-		return;
-	}
-
-	std::istringstream valueText(found->second);
-	valueText >> val;
-}
-
-// String lookup: when "--arg_name" was given, val receives a malloc'ed copy
-// of its value text, which the caller releases with free(). val is set to
-// NULL when the option is absent or the copy cannot be allocated.
-template <>
-void b3CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
-{
-	using namespace std;
-	val = NULL;
-
-	map<string, string>::iterator itr = pairs.find(arg_name);
-	if (itr == pairs.end()) {
-		return;
-	}
-
-	const string& s = itr->second;
-	char* copy = (char*) malloc(sizeof(char) * (s.length() + 1));
-	if (copy != NULL) {
-		// only copy into the buffer when the allocation succeeded
-		strcpy(copy, s.c_str());
-	}
-	val = copy;
-}
-
-
-
-
-
-/******************************************************************************
- * Main
- ******************************************************************************/
-
-extern bool gDebugSkipLoadingBinary;
-
-// Benchmark driver: creates the OpenCL context, device and command queue,
-// then runs TestSort on random unsigned int keys.
-// Options: --help, --v, --i=<iterations>, --n=<elements>, --key-values,
-// --deviceId=<int>, --platformId=<int>.
-// Returns 0 on success (or after printing the usage text) and 1 when the
-// context contains no OpenCL device.
-int main( int argc, char** argv) 
-{
-	//gDebugSkipLoadingBinary = true;
-
-	b3CommandLineArgs args(argc,argv);
-
-	// answer --help before touching OpenCL, so it works without a device
-	if (args.CheckCmdLineFlag("help"))
-	{
-		Usage();
-		return 0;
-	}
-
-	args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
-	args.GetCmdLineArgument("platformId", gPreferredPlatformId);
-
-	printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n");
-	cl_int ciErrNum = CL_SUCCESS;
-	cl_platform_id platformId = 0;
-	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
-//	g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
-	
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-
-	int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext);
-
-	if (!numDev)
-	{
-		printf("error: no OpenCL devices\n");
-		clReleaseContext(g_cxMainContext);
-		return 1;
-	}
-
-	int devId = 0;
-	g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId);
-	b3OpenCLUtils_printDeviceInfo(g_device);
-	// create a command-queue
-	g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-
-	//srand(time(NULL));	
-	srand(0);				// presently deterministic
-
-	// defaults; keep the text printed by Usage() in sync with these
-	unsigned int num_elements 					= 8*1024*1024;
-	unsigned int iterations  					= 10;
-
-	args.GetCmdLineArgument("i", iterations);
-	args.GetCmdLineArgument("n", num_elements);
-
-	bool keys_only = !args.CheckCmdLineFlag("key-values");
-	g_verbose = args.CheckCmdLineFlag("v");
-
-	TestSort<unsigned int, unsigned int>(
-			iterations,
-			num_elements, 
-			keys_only);
-
-	// release what this function created
-	clReleaseCommandQueue(g_cqCommandQueue);
-	clReleaseContext(g_cxMainContext);
-
-	return 0;
-}
--- a/opencl/parallel_primitives/host/b3Int2.h
+++ b/opencl/parallel_primitives/host/b3Int2.h
@ -1,35 +0,0 @@
-#ifndef B3_INT2_H
-#define B3_INT2_H
-
-/// Two unsigned integers that share storage between the named members x, y and the array s[2].
-struct b3UnsignedInt2
-{
-	union
-	{
-		struct
-		{
-			unsigned int x;
-			unsigned int y;
-		};
-		struct
-		{
-			unsigned int s[2];
-		};
-	};
-};
-
-/// Two signed integers that share storage between the named members x, y and the array s[2].
-struct b3Int2
-{
-	union
-	{
-		struct
-		{
-			int x;
-			int y;
-		};
-		struct
-		{
-			int s[2];
-		};
-	};
-};
-
-
-#endif
--- a/opencl/parallel_primitives/test/main.cpp
+++ b/opencl/parallel_primitives/test/main.cpp
@ -1,379 +0,0 @@
-/*
-Copyright (c) 2012 Advanced Micro Devices, Inc.  
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-#include <stdio.h>
-#include "../basic_initialize/b3OpenCLUtils.h"
-#include "../host/b3FillCL.h"
-#include "../host/b3BoundSearchCL.h"
-#include "../host/b3RadixSort32CL.h"
-#include "../host/b3PrefixScanCL.h"
-#include "Bullet3Common/b3CommandLineArgs.h"
-#include "Bullet3Common/b3MinMax.h"
-
-int g_nPassed = 0;
-int g_nFailed = 0;
-bool g_testFailed = 0;
-
-#define TEST_INIT g_testFailed = 0;
-#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
-#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
-#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
-
-// OpenCL state shared by all tests; set up by initCL and released by exitCL.
-cl_context g_context=0;
-cl_device_id g_device=0;
-cl_command_queue g_queue =0;
-// Name of the selected device, assigned in initCL from b3OpenCLDeviceInfo::m_deviceName.
-const char* g_deviceName = 0;
-
-/**
- * Creates the global OpenCL context and, when that context exposes at least one device,
- * selects device 0, creates the global command queue for it, prints the device info and
- * records the device name in g_deviceName.
- *
- * @param preferredDeviceIndex   device index forwarded to b3OpenCLUtils::createContextFromType
- * @param preferredPlatformIndex platform index forwarded to b3OpenCLUtils::createContextFromType
- */
-void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
-{
-	int ciErrNum = 0;
-	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
-
-	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-
-	g_context = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	int numDev = b3OpenCLUtils::getNumDevices(g_context);
-	if (numDev>0)
-	{
-		// static storage: g_deviceName is assigned from a member of this struct and is still
-		// readable after initCL returns (presumably m_deviceName is a char array embedded in
-		// the struct -- with a block-local struct the pointer would dangle)
-		static b3OpenCLDeviceInfo info;
-		g_device= b3OpenCLUtils::getDevice(g_context,0);
-		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
-		oclCHECKERROR(ciErrNum, CL_SUCCESS);
-		b3OpenCLUtils::printDeviceInfo(g_device);
-		b3OpenCLUtils::getDeviceInfo(g_device,&info);
-		g_deviceName = info.m_deviceName;
-	}
-}
-
-/// Releases the global command queue and context created by initCL and clears both handles.
-void exitCL()
-{
-	// initCL leaves g_queue at 0 when the context has no device; only release real handles
-	if (g_queue)
-	{
-		clReleaseCommandQueue(g_queue);
-		g_queue = 0;
-	}
-	if (g_context)
-	{
-		clReleaseContext(g_context);
-		g_context = 0;
-	}
-}
-
-
-/**
- * Fills a device buffer through b3FillCL::execute and a host buffer through
- * b3FillCL::executeHost with the same value, for NUM_TESTS growing sizes, and compares the
- * first `size` elements of both. The outcome is reported as "fillIntTest".
- */
-inline void fillIntTest()
-{
-	TEST_INIT;
-
-	b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue);
-	int maxSize=1024*256;
-	b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
-	intBuffer.resize(maxSize);
-
-// number of sizes every test iterates over; also used by the tests defined after this one
-#define NUM_TESTS 7
-
-	int dx = maxSize/NUM_TESTS;
-	for (int iter=0;iter<NUM_TESTS;iter++)
-	{
-		int size = b3Min( 11+dx*iter, maxSize );
-
-		int value = 2;
-		int offset=0;
-
-		// device result
-		fillCL->execute(intBuffer,value,size,offset);
-
-		// host reference
-		b3AlignedObjectArray<int> hostBuf2;
-		hostBuf2.resize(size);
-		fillCL->executeHost(hostBuf2,value,size,offset);
-
-		b3AlignedObjectArray<int> hostBuf;
-		intBuffer.copyToHost(hostBuf);
-
-		for(int i=0; i<size; i++)
-		{
-			TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
-		}
-	}
-
-	delete fillCL;
-
-	TEST_REPORT( "fillIntTest" );
-}
-
-
-/// Seeds the C library generator whose rand() values getRandom consumes.
-__inline
-void seedRandom(int seed)
-{
-	srand( seed );
-}
-
-/// Returns minV plus a rand()-derived fraction (a multiple of 1/10000 below 1) of
-/// (maxV - minV), converted to T.
-template<typename T>
-__inline
-T getRandom(const T& minV, const T& maxV)
-{
-	const float fraction = (rand()%10000)/10000.f;
-	const T span = maxV - minV;
-	return (T)(minV + fraction*span);
-}
-
-/// Ordering predicate for b3SortData: ascending by m_key, equal keys ordered by ascending m_value.
-struct b3SortDataCompare
-{
-	inline bool operator()(const b3SortData& first, const b3SortData& second) const
-	{
-		if (first.m_key < second.m_key)
-		{
-			return true;
-		}
-		if (first.m_key == second.m_key)
-		{
-			return first.m_value < second.m_value;
-		}
-		return false;
-	}
-};
-
-
-/**
- * Exercises b3BoundSearchCL. For NUM_TESTS growing sizes it builds a key-sorted array of
- * b3SortData with random keys, runs the upper and lower bound search on the device
- * (execute) and on the host (executeHost), compares both result sets, and finally checks
- * that every element between lowerHost[i] and upperHost[i] carries the key i.
- * The outcome is reported as "boundSearchTest".
- */
-void boundSearchTest( )
-{
-	TEST_INIT;
-
-	int maxSize = 1024*256;
-	int bucketSize = 256;
-
-	b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize);
-	b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
-	b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
-	
-	b3AlignedObjectArray<b3SortData> srcHost;
-	b3AlignedObjectArray<unsigned int> upperHost;
-	b3AlignedObjectArray<unsigned int> lowerHost;
-	b3AlignedObjectArray<unsigned int> upperHostCompare;
-	b3AlignedObjectArray<unsigned int> lowerHostCompare;
-	
-	b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize);
-
-
-	int dx = maxSize/NUM_TESTS;
-	for(int iter=0; iter<NUM_TESTS; iter++)
-	{
-		
-		int size = b3Min( 128+dx*iter, maxSize );
-
-		upperHost.resize(bucketSize);
-		lowerHost.resize(bucketSize);
-		upperHostCompare.resize(bucketSize);
-		lowerHostCompare.resize(bucketSize);
-
-		srcHost.resize(size);
-
-		// random keys below bucketSize; the value remembers the original index
-		for(int i=0; i<size; i++) 
-		{
-			b3SortData v;
-//			v.m_key = i<2? 0 : 5;
-			v.m_key = getRandom(0,bucketSize);
-
-			v.m_value = i;
-			srcHost.at(i) = v;
-		}
-
-		// the source array is sorted by key before the search runs
-		srcHost.quickSort(b3SortDataCompare());
-		srcCL.copyFromHost(srcHost);
-
-		{
-			
-			// reset all bounds to -1 (wraps to the largest unsigned value) on host and device
-			for(int i=0; i<bucketSize; i++) 
-			{
-				lowerHost[i] = -1;
-				lowerHostCompare[i] = -1;
-				upperHost[i] = -1;
-				upperHostCompare[i] = -1;
-			}
-			upperCL.copyFromHost(upperHost);
-			lowerCL.copyFromHost(lowerHost);
-		}
-
-		// device search
-		search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER);
-		search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER);
-
-		// host reference search
-		search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER);
-		search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER);
-
-		lowerCL.copyToHost(lowerHost);
-		upperCL.copyToHost(upperHost);
-		// device and host results must agree for every bucket
-		for(int i=0; i<bucketSize; i++)
-		{
-			TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
-			TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
-		}
-		/*
-		for(int i=1; i<bucketSize; i++)
-		{
-			int lhi_1 = lowerHost[i-1];
-			int lhi = lowerHost[i];
-
-			for(int j=lhi_1; j<lhi; j++)
-			//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
-			{
-				TEST_ASSERT( srcHost[j].m_key < i );
-			}
-		}
-
-		for(int i=0; i<bucketSize; i++)
-		{
-			int jMin = (i==0)?0:upperHost[i-1];
-			for(int j=jMin; j<upperHost[i]; j++)
-			{
-				TEST_ASSERT( srcHost[j].m_key <= i );
-			}
-		}
-		*/
-
-
-		// every element in [lowerHost[i], upperHost[i]) must have key i
-		// NOTE(review): a bound still holding the -1 reset value becomes -1 as int here -- confirm
-		// the search writes every bucket, including keys that do not occur
-		for(int i=0; i<bucketSize; i++)
-		{
-			int lhi = lowerHost[i];
-			int uhi = upperHost[i];
-
-			for(int j=lhi; j<uhi; j++)
-			{
-				if ( srcHost[j].m_key != i )
-				{
-					printf("error %d != %d\n",srcHost[j].m_key,i);
-				}
-				TEST_ASSERT( srcHost[j].m_key == i );
-			}
-		}
-
-	}
-
-	delete search;
-
-	TEST_REPORT( "boundSearchTest" );
-}
-
-
-/**
- * Exercises b3PrefixScanCL. For NUM_TESTS growing sizes an array of ones is scanned on the
- * host (executeHost) and on the device (execute); the two reported sums and every output
- * element are compared. The outcome is reported as "scanTest".
- */
-void prefixScanTest()
-{
-	TEST_INIT;
-
-	int maxSize = 1024*256;
-
-	b3AlignedObjectArray<unsigned int> buf0Host;
-	b3AlignedObjectArray<unsigned int> buf1Host;
-
-	b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
-	b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
-	
-	
-	b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize);
-		
-	int dx = maxSize/NUM_TESTS;
-	for(int iter=0; iter<NUM_TESTS; iter++)
-	{
-		int size = b3Min( 128+dx*iter, maxSize );
-		buf0Host.resize(size);
-		buf1Host.resize(size);
-
-		// input: all ones
-		for(int i=0; i<size; i++) 
-			buf0Host[i] = 1;
-		
-		buf2CL.copyFromHost( buf0Host);
-	
-		unsigned int sumHost, sumGPU;
-
-		// host reference goes to buf1Host, device result to buf3CL
-		scan->executeHost(buf0Host, buf1Host, size, &sumHost );
-		scan->execute( buf2CL, buf3CL, size, &sumGPU );
-
-		// buf0Host is reused to hold the device result
-		buf3CL.copyToHost(buf0Host);
-		
-		TEST_ASSERT( sumHost == sumGPU );
-		for(int i=0; i<size; i++) 
-			TEST_ASSERT( buf1Host[i] == buf0Host[i] );
-	}
-
-	delete scan;
-
-	TEST_REPORT( "scanTest" );
-}
-
-
-/**
- * Exercises b3RadixSort32CL. For NUM_TESTS growing sizes an array of b3SortData with random
- * keys is sorted on the host (executeHost) and on the device (execute), and keys and values
- * are compared element by element. The outcome is reported as "radixSort".
- *
- * @return g_testFailed, i.e. true when the test FAILED
- *         NOTE(review): the polarity is easy to misread; the caller in this file ignores it.
- */
-bool radixSortTest()
-{
-	TEST_INIT;
-	
-	int maxSize = 1024*256;
-
-	b3AlignedObjectArray<b3SortData> buf0Host;
-	buf0Host.resize(maxSize);
-	b3AlignedObjectArray<b3SortData> buf1Host;
-	buf1Host.resize(maxSize );
-	b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize);
-
-	b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize);
-
-	int dx = maxSize/NUM_TESTS;
-	for(int iter=0; iter<NUM_TESTS; iter++)
-	{
-		// size is capped at maxSize-512 so that rounding up to a multiple of 512 stays within maxSize
-		int size = b3Min( 128+dx*iter, maxSize-512 );
-		size = NEXTMULTIPLEOF( size, 512 );//not necessary
-		
-		buf0Host.resize(size);
-
-		// random keys; the value remembers the original index
-		for(int i=0; i<size; i++)
-		{
-			b3SortData v;
-			v.m_key = getRandom(0,0xff);
-			v.m_value = i;
-			buf0Host[i] = v;
-		}
-
-		buf2CL.copyFromHost( buf0Host);
-		
-
-		// host reference sorts buf0Host, the device sorts buf2CL
-		sort->executeHost( buf0Host);
-		sort->execute(buf2CL);
-
-		buf2CL.copyToHost(buf1Host);
-				
-		for(int i=0; i<size; i++) 
-		{
-			TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
-		}
-	}
-
-	delete sort;
-
-	TEST_REPORT( "radixSort" );
-
-	return g_testFailed;
-}
-
-
-/**
- * Runs the parallel primitive tests (fill, bound search, prefix scan, radix sort) on the
- * OpenCL device selected through the "deviceId" and "platformId" arguments, prints the
- * totals and waits for a key press.
- *
- * @return 0 when no test failed, 1 otherwise, so scripts can detect failures
- */
-int main(int argc, char** argv)
-{
-	// -1 is forwarded unchanged when the argument is not given
-	int preferredDeviceIndex = -1;
-	int preferredPlatformIndex = -1;
-
-	b3CommandLineArgs args(argc, argv);
-	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
-	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
-
-	initCL(preferredDeviceIndex,preferredPlatformIndex);
-
-	fillIntTest();
-
-	boundSearchTest();
-
-	prefixScanTest();
-
-	radixSortTest();
-
-	exitCL();
-
-	printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
-	printf("End, press <enter>\n");
-	getchar();
-
-	return g_nFailed ? 1 : 0;
-}
-
--- a/opencl/parallel_primitives/test/premake4.lua
+++ b/opencl/parallel_primitives/test/premake4.lua
@ -1,41 +0,0 @@
-function createProject(vendor)
-	-- Defines the console project "OpenCL_primitives_test_<vendor>" when findOpenCL
-	-- reports OpenCL support for the given vendor; otherwise does nothing.
-
-	-- local: keeps the flag out of the global table shared with the other premake scripts
-	local hasCL = findOpenCL(vendor)
-
-	if (hasCL) then
-
-		project ("OpenCL_primitives_test_" .. vendor)
-
-		initOpenCL(vendor)
-
-		language "C++"
-
-		kind "ConsoleApp"
-		targetdir "../../../bin"
-		includedirs {".","..","../../../src"}
-
-		-- the test compiles the primitives and the OpenCL utilities directly instead of linking a library
-		files {
-			"main.cpp",
-			"../../basic_initialize/b3OpenCLInclude.h",
-			"../../basic_initialize/b3OpenCLUtils.cpp",
-			"../../basic_initialize/b3OpenCLUtils.h",
-			"../host/b3FillCL.cpp",
-			"../host/b3FillCL.h",
-			"../host/b3BoundSearchCL.cpp",
-			"../host/b3BoundSearchCL.h",
-			"../host/b3PrefixScanCL.cpp",
-			"../host/b3PrefixScanCL.h",
-			"../host/b3RadixSort32CL.cpp",
-			"../host/b3RadixSort32CL.h",
-			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
-			"../../../src/Bullet3Common/b3AlignedAllocator.h",
-			"../../../src/Bullet3Common/b3AlignedObjectArray.h",
-		}
-
-	end
-end
-
-createProject("AMD")
-createProject("Intel")
-createProject("NVIDIA")
-createProject("Apple")
--- a/opencl/reduce/main.cpp
+++ b/opencl/reduce/main.cpp
@ -1,116 +0,0 @@
-///original author: Erwin Coumans
-#include "b3OpenCLUtils.h"
-#include "../parallel_primitives/host/b3OpenCLArray.h"
-#include "../parallel_primitives/host/b3LauncherCL.h"
-#include <stdio.h>
-
-
-#define MSTRINGIFY(A) #A
-// OpenCL source of the ReduceGlobal kernel, turned into a string literal by MSTRINGIFY.
-// Each work group adds up its slice of d_in in place by repeated halving and work item 0
-// of the group writes the group total to d_out[group id].
-const char* kernelString= MSTRINGIFY(
-__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements)
-{
-	int myId = get_global_id(0);
-	int tid = get_local_id(0);
-
-
-	int ls = get_local_size(0);
-	for (unsigned int s=ls/2;s>0;s>>=1)
-	{
-		if (myId<numElements)
-		{
-			// the partner element must also lie inside the array, otherwise the last group
-			// reads past the end when numElements is not a multiple of the group size
-			if (tid<s && (myId+s)<numElements)
-			{
-				d_in[myId] += d_in[myId+s];
-			}
-		}
-		// executed by every work item of the group, outside the conditionals
-		barrier(CLK_GLOBAL_MEM_FENCE);
-	}
-	if (tid==0)
-	{
-		if (myId<numElements)
-		{
-			d_out[get_group_id(0)]=d_in[myId];
-		}
-	}
-}
-);
-
-/**
- * Sums 1024*1024 ones on the OpenCL device with two launches of the ReduceGlobal kernel
- * and compares the result with a sum computed on the host. The verdict is printed to stdout.
- *
- * @return always 0
- */
-int main(int argc, char* argv[])
-{
-	int ciErrNum = 0;
-	int preferred_device = -1;
-	int preferred_platform = -1;
-	cl_platform_id		platformId = 0;
-	cl_context			ctx;
-	cl_command_queue	queue;
-	cl_device_id		device;
-	cl_kernel			addKernel;
-	ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	if (!ctx) {
-		printf("No OpenCL capable GPU found!");
-		return 0;
-	}
-	// only query the platform once the context is known to exist
-	b3OpenCLUtils::printPlatformInfo(platformId);
-
-	device = b3OpenCLUtils::getDevice(ctx,0);
-	queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
-	addKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"ReduceGlobal",&ciErrNum);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	int numElements = 1024*1024;
-	b3OpenCLArray<int> a(ctx,queue);
-	b3OpenCLArray<int> b(ctx,queue);
-	b3AlignedObjectArray<int> hostA;
-	b3AlignedObjectArray<int> hostB;
-
-	// input: all ones; output buffer: all zeros (integer arrays, so an integer zero)
-	for (int i=0;i<numElements;i++)
-	{
-		hostA.push_back(1);
-		hostB.push_back(0);
-	}
-	a.copyFromHost(hostA);
-	b.copyFromHost(hostB);
-
-	// host reference
-	int hostSum= 0;
-	for (int i=0;i<numElements;i++)
-	{
-		hostSum += hostA.at(i);
-	}
-	b.resize(numElements);
-
-	// first launch: a -> b, launch1D(numElements,1024)
-	// (presumably 1024 is the work-group size, giving one partial sum per group)
-	{
-		b3LauncherCL launcher( queue, addKernel);
-		launcher.setBuffer( a.getBufferCL());
-		launcher.setBuffer( b.getBufferCL());
-		launcher.setConst(  numElements );
-		launcher.launch1D( numElements,1024);
-	}
-	clFinish(queue);
-	// second launch: the first 1024 entries of b -> a, the kernel writes its total to a[0]
-	{
-		b3LauncherCL launcher( queue, addKernel);
-		launcher.setBuffer( b.getBufferCL());
-		launcher.setBuffer( a.getBufferCL());
-		launcher.setConst(  1024 );
-		launcher.launch1D( 1024,1024);
-	}
-	clFinish(queue);
-
-	printf("hostSum = %d\n", hostSum);
-
-	int clSum = a.at(0);
-	printf("clSum = %d\n", clSum );
-	if (hostSum != clSum)
-	{
-		printf("Incorrect result\n");
-	} else
-	{
-		printf("Correct result\n");
-	}
-
-	// release everything this function created, the kernel included
-	clReleaseKernel(addKernel);
-	clReleaseCommandQueue(queue);
-	clReleaseContext(ctx);
-	printf("press key\n");
-	getchar();
-	return 0;
-}
--- a/opencl/reduce/premake4.lua
+++ b/opencl/reduce/premake4.lua
@ -1,41 +0,0 @@
-
-function createProject (vendor)
-	-- Defines the console project "OpenCL_reduce_<vendor>" when findOpenCL reports
-	-- OpenCL support for the given vendor; otherwise does nothing.
-
-	local hasCL = findOpenCL(vendor)
-	
-	if (hasCL) then
-
-		project ( "OpenCL_reduce_" .. vendor)
-
-		initOpenCL(vendor)
-	
-		language "C++"
-				
-		kind "ConsoleApp"
-		targetdir "../../bin"
-
-		-- links against the per-vendor parallel primitives host library
-		links {
-			"OpenCL_lib_parallel_primitives_host_" .. vendor
-		}
-
-		includedirs {
-			"../basic_initialize",
-			"../../src"
-		}
-		
-		-- the OpenCL utilities and the aligned allocator are compiled into the demo itself
-		files {
-			"main.cpp",
-			"../basic_initialize/b3OpenCLUtils.cpp",
-			"../basic_initialize/b3OpenCLUtils.h",
-			"../../src/Bullet3Common/b3AlignedAllocator.cpp",
-			"../../src/Bullet3Common/b3AlignedAllocator.h",
-			"../../src/Bullet3Common/b3AlignedObjectArray.h",
-		}
-	end
-	
-end
-
-createProject("AMD")
-createProject("NVIDIA")
-createProject("Intel")
-createProject("Apple")
--- a/opencl/vector_add/VectorAddKernels.cl
+++ b/opencl/vector_add/VectorAddKernels.cl
@ -1,16 +0,0 @@
-
-
-// Element-wise sum of two float8 arrays: c[i] = a[i] + b[i] for every work item i < numElements.
-__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)
-{
-	// index of this work item in float8 units
-	const int idx = get_global_id(0);
-
-	// work items at or beyond numElements do nothing
-	if (idx < numElements)
-	{
-		// write the sum back to global memory
-		c[idx] = a[idx] + b[idx];
-	}
-}
--- a/opencl/vector_add/VectorAddKernels.h
+++ b/opencl/vector_add/VectorAddKernels.h
@ -1,20 +0,0 @@
-// This file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project.
-// It holds the text of the VectorAdd kernel as a C string; vector_add/main.cpp uses it when
-// LOAD_FROM_FILE is not defined.
-static const char* vectorAddCL= \
-"\n"
-"\n"
-"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n"
-"{\n"
-"    // get oct-float index into global data array\n"
-"    int iGID = get_global_id(0);\n"
-"	if (iGID>=numElements)\n"
-"		return;\n"
-"\n"
-"	float8 aGID = a[iGID];\n"
-"	float8 bGID = b[iGID];\n"
-"\n"
-"	float8 result = aGID + bGID;\n"
-"    // write back out to GMEM\n"
-"    c[iGID] = result;\n"
-"}\n"
-"\n"
-;
--- a/opencl/vector_add/main.cpp
+++ b/opencl/vector_add/main.cpp
@ -1,408 +0,0 @@
-
-///VectorAdd sample, from the NVidia JumpStart Guide
-///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
-
-///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
-///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
-
-
-#define LOAD_FROM_FILE
-
-#ifdef __APPLE__
-	#include <OpenCL/OpenCL.h>
-#else
-	#include <CL/cl.h>
-#endif //__APPLE__
-#ifdef _WIN32
-#pragma warning (disable:4996)
-#endif
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-// Prints the error code and asserts when a differs from b.
-// NOTE(review): nothing in this file uses the macro, and b3Assert is not provided by the headers included above.
-#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); b3Assert((a) == (b)); }
-// Work-group size reported for the VectorAdd kernel; filled in by main.
-size_t wgSize;
-
-#include "VectorAddKernels.h"
-
-// Vendor string of the platform main looks for, selected by the CL_PLATFORM_* build define.
-#ifdef CL_PLATFORM_INTEL
-	const char* preferredPlatform = "Intel(R) Corporation";
-#elif defined CL_PLATFORM_AMD
-	const char* preferredPlatform = "Advanced Micro Devices, Inc.";
-#elif defined CL_PLATFORM_NVIDIA
-	const char* preferredPlatform = "NVIDIA Corporation";
-#else
-	const char* preferredPlatform = "Unknown";
-#endif
-
-
-
-/**
- * Loads a file and returns its contents, with cPreamble prepended, as a newly allocated
- * NUL-terminated string.
- *
- * @param cFilename     path of the file to read (opened in binary mode)
- * @param cPreamble     text copied in front of the file contents; must not be NULL, pass "" for none
- * @param szFinalLength optional out-parameter receiving the combined length without the terminator
- * @return malloc-allocated string the caller must free(), or NULL when the file cannot be
- *         opened, measured or read, or when the allocation fails
- */
-char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-	// open the OpenCL source code file
-	FILE* pFileStream = fopen(cFilename, "rb");
-	if (pFileStream == NULL)
-	{
-		return NULL;
-	}
-
-	size_t szPreambleLength = strlen(cPreamble);
-
-	// get the length of the source code; ftell reports failure as -1
-	long fileLength = -1;
-	if (fseek(pFileStream, 0, SEEK_END) == 0)
-	{
-		fileLength = ftell(pFileStream);
-	}
-	if (fileLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
-	{
-		fclose(pFileStream);
-		return NULL;
-	}
-	size_t szSourceLength = (size_t)fileLength;
-
-	// allocate a buffer for preamble + source + terminator
-	char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
-	if (cSourceString == NULL)
-	{
-		fclose(pFileStream);
-		return NULL;
-	}
-	memcpy(cSourceString, cPreamble, szPreambleLength);
-
-	// an empty file is valid; otherwise the whole file must arrive as one item
-	if (szSourceLength > 0 && fread(cSourceString + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-	{
-		free(cSourceString);
-		fclose(pFileStream);
-		return NULL;
-	}
-	fclose(pFileStream);
-
-	// report the total length of the combined (preamble + source) string
-	if (szFinalLength != NULL)
-	{
-		*szFinalLength = szSourceLength + szPreambleLength;
-	}
-	cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-	return cSourceString;
-}
-
-size_t workitem_size[3];
-
-/**
- * Prints the name, type(s), number of compute units and maximum work-item sizes of an
- * OpenCL device to stdout. The work-item sizes are also stored in the global workitem_size.
- *
- * @param device device to describe
- */
-void printDevInfo(cl_device_id device)
-{
-	// zero-initialized so that a failed query prints an empty name instead of garbage
-	char device_string[1024] = {0};
-
-	clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
-	printf(  " Device %s:\n", device_string);
-
-	// CL_DEVICE_TYPE is a bit field, so more than one line may be printed
-	cl_device_type type = 0;
-	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
-	if( type & CL_DEVICE_TYPE_CPU )
-		printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
-	if( type & CL_DEVICE_TYPE_GPU )
-		printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
-	if( type & CL_DEVICE_TYPE_ACCELERATOR )
-		printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
-	if( type & CL_DEVICE_TYPE_DEFAULT )
-		printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
-
-	// CL_DEVICE_MAX_COMPUTE_UNITS
-	cl_uint compute_units = 0;
-	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
-	printf(  " CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", (unsigned int)compute_units);
-
-	// CL_DEVICE_MAX_WORK_ITEM_SIZES
-	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
-	// size_t does not match %u on 64-bit targets, so convert explicitly
-	printf(  " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
-
-}
-
-
-
-
-/**
- * VectorAdd sample: adds two float vectors of iTestN elements on the first device of the
- * selected OpenCL platform, one float8 (eight floats) per work item, then validates the
- * result on the host.
- *
- * @return 0 when the device result matches the host computation, 1 when validation fails;
- *         setup errors terminate the process with exit status 1
- */
-int main(int argc, char **argv)
-{
-	void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
-	cl_context cxGPUContext;        // OpenCL context
-	cl_command_queue cqCommandQue;  // OpenCL command que
-	cl_device_id* cdDevices;        // OpenCL device list
-	cl_program cpProgram;           // OpenCL program
-	cl_kernel ckKernel;             // OpenCL kernel
-	cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
-	size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
-	size_t szParmDataBytes = 0;     // Byte size of context information
-	cl_int ciErr1, ciErr2;          // Error code var
-
-	int iTestN = 100000 * 8;        // Size of Vectors to process
-
-	// one work item per float8, i.e. 8 computations per work item
-	int actualGlobalSize = iTestN / 8;
-	szGlobalWorkSize[0] = iTestN >> 3;
-
-	// Allocate and initialize host arrays
-	srcA = (void *)malloc (sizeof(cl_float) * iTestN);
-	srcB = (void *)malloc (sizeof(cl_float) * iTestN);
-	dst = (void *)malloc (sizeof(cl_float) * iTestN);
-	if (!srcA || !srcB || !dst)
-	{
-		printf("Cannot allocate the host buffers\n");
-		exit(1);
-	}
-
-	int i;
-
-	// Initialize arrays with some values
-	for (i=0;i<iTestN;i++)
-	{
-		((cl_float*)srcA)[i] = cl_float(i);
-		((cl_float*)srcB)[i] = 2;
-		((cl_float*)dst)[i]=-1;
-	}
-
-	// Pick the platform whose vendor string equals preferredPlatform; when none matches,
-	// the last enumerated platform stays selected.
-	cl_uint numPlatforms = 0;
-	cl_platform_id platform = NULL;
-	cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
-
-	if (status == CL_SUCCESS && 0 < numPlatforms)
-	{
-		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
-		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
-
-		for (unsigned p = 0; status == CL_SUCCESS && p < numPlatforms; ++p)
-		{
-			char pbuf[100] = {0};
-			cl_int infoStatus = clGetPlatformInfo(platforms[p],
-										CL_PLATFORM_VENDOR,
-										sizeof(pbuf),
-										pbuf,
-										NULL);
-
-			platform = platforms[p];
-			if (infoStatus == CL_SUCCESS && !strcmp(pbuf, preferredPlatform))
-			{
-				printf("Found platform %s\n", preferredPlatform);
-				break;
-			}
-		}
-		delete[] platforms;
-	}
-
-	cl_context_properties cps[3] =
-	{
-		CL_CONTEXT_PLATFORM,
-		(cl_context_properties)platform,
-		0
-	};
-
-	// Create OpenCL context
-	cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
-	if (ciErr1 != CL_SUCCESS || !cxGPUContext)
-	{
-		printf("Error in clCreateContextFromType, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-
-	// Query all devices available to the context
-	ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
-	if (ciErr1 != CL_SUCCESS || szParmDataBytes < sizeof(cl_device_id))
-	{
-		printf("The OpenCL context reports no devices\n");
-		exit(1);
-	}
-	cdDevices = (cl_device_id*)malloc(szParmDataBytes);
-	if (!cdDevices)
-	{
-		printf("Cannot allocate the device list\n");
-		exit(1);
-	}
-	ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("Error in clGetContextInfo, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-	printDevInfo(cdDevices[0]);
-
-	// Create a command queue for first device the context reported
-	cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
-	ciErr1 |= ciErr2;
-
-	// Allocate the OpenCL source and result buffer memory objects on the device GMEM
-	cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
-	ciErr1 |= ciErr2;
-	cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
-	ciErr1 |= ciErr2;
-	cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
-	ciErr1 |= ciErr2;
-	// checked here because clCreateProgramWithSource below overwrites ciErr1
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("Error creating the command queue or the buffers, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-
-	// Read the OpenCL kernel in from source file
-	const char* cSourceFile = "opencl/vector_add/VectorAddKernels.cl";
-
-#ifdef LOAD_FROM_FILE
-	size_t szKernelLength;
-
-	const char* cSourceCL =0;
-	char relativeFileName[1024];
-
-	{
-		// the working directory is unknown, so the file is searched for a few levels up
-		const char* prefix[]={"../","../../","../../../","../../../../"};
-		int numPrefixes = sizeof(prefix)/sizeof(char*);
-
-		for (int k=0;!cSourceCL && k<numPrefixes;k++)
-		{
-			sprintf(relativeFileName,"%s%s",prefix[k],cSourceFile);
-			cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
-			if (cSourceCL)
-			{
-				printf("Loaded program source: %s\n", relativeFileName);
-			}
-		}
-	}
-
-	if (!cSourceCL)
-	{
-		printf("Couldn't find file %s, exiting\n",cSourceFile);
-		exit(1);
-	}
-#else
-	const char* cSourceCL = vectorAddCL;
-	size_t szKernelLength = strlen(cSourceCL);
-#endif //LOAD_FROM_FILE
-
-	// Create the program
-	cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
-	printf("clCreateProgramWithSource...\n");
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-
-	// Build the program with 'mad' Optimization option
-#ifdef MAC
-	char* flags = "-cl-mad-enable -DMAC ";
-#else
-	char flags[1024]={0};
-#ifdef CL_PLATFORM_INTEL
-	sprintf(flags,"-g -s \"%s\"","C:/develop/experiments/opencl/vector_add/VectorAddKernels.cl");
-#endif//CL_PLATFORM_INTEL
-
-#endif//MAC
-	ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
-	printf("clBuildProgram...\n");
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-
-	// Create the kernel
-	ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
-	printf("clCreateKernel (VectorAdd)...\n");
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-
-	cl_int ciErrNum;
-
-	ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
-	if (ciErrNum != CL_SUCCESS)
-	{
-		printf("cannot get workgroup size\n");
-		exit(1);
-	}
-
-	// Set the Argument values
-	ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
-	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
-	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
-	ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&actualGlobalSize);
-
-	// pause before the launch; the program continues after the key press
-	printf("Press ENTER to run the kernel\n");
-	getchar();
-
-	int workgroupSize = (int)wgSize;
-	if(workgroupSize <= 0)
-	{ // let OpenCL library calculate workgroup size
-		size_t globalWorkSize[1];
-		globalWorkSize[0] = actualGlobalSize;
-
-		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
-	}
-	else
-	{
-		// round the global size up to a whole number of work groups; the kernel itself
-		// returns early for work items at or beyond numElements
-		int num_t = actualGlobalSize / workgroupSize;
-		if(num_t * workgroupSize < actualGlobalSize)
-		{
-			num_t++;
-		}
-
-		size_t globalThreads[] = {(size_t)(num_t * workgroupSize)};
-		size_t localThreads[] = {(size_t)workgroupSize};
-
-		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
-	}
-
-	// ciErr1 accumulates the argument setup and the enqueue result
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("cannot clEnqueueNDRangeKernel\n");
-		exit(1);
-	}
-
-	clFinish(cqCommandQue);
-	// Read back results and check accumulated errors
-	ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
-	if (ciErr1 != CL_SUCCESS)
-	{
-		printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
-		exit(1);
-	}
-
-	// Release kernel, program, and memory objects
-	// NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
-	for (i = 0; i < 3; i++)
-	{
-		clReleaseMemObject(cmMemObjs[i]);
-	}
-	free(cdDevices);
-	clReleaseKernel(ckKernel);
-	clReleaseProgram(cpProgram);
-	clReleaseCommandQueue(cqCommandQue);
-	clReleaseContext(cxGPUContext);
-#ifdef LOAD_FROM_FILE
-	// the source text was allocated by loadProgSource
-	free((void*)cSourceCL);
-#endif //LOAD_FROM_FILE
-
-	// compare every element with the host computation
-	int iErrorCount = 0;
-	for (i = 0; i < iTestN; i++)
-	{
-		if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
-			iErrorCount++;
-	}
-
-	if (iErrorCount)
-	{
-		printf("Validation FAILED\n");
-	} else
-	{
-		printf("Validation SUCCESSFULL\n");
-	}
-
-	// Free host memory
-	free(srcA);
-	free(srcB);
-	free (dst);
-	printf("Press ENTER to quit\n");
-	getchar();
-
-	return iErrorCount ? 1 : 0;
-}
-
-
--- a/opencl/vector_add_simplified/main.cpp
+++ b/opencl/vector_add_simplified/main.cpp
@ -1,69 +0,0 @@
-///original author: Erwin Coumans
-#include "b3OpenCLUtils.h"
-#include "../parallel_primitives/host/b3OpenCLArray.h"
-#include "../parallel_primitives/host/b3LauncherCL.h"
-#include <stdio.h>
-
-
-#define MSTRINGIFY(A) #A
-// OpenCL source of the VectorAdd kernel, turned into a string literal by MSTRINGIFY:
-// c[i] = a[i] + b[i] for every work item i < numElements.
-const char* kernelString= MSTRINGIFY(
-__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements)
-{
-	const int idx = get_global_id(0);
-	if (idx < numElements)
-	{
-		c[idx] = a[idx] + b[idx];
-	}
-}
-);
-
-/**
- * Minimal VectorAdd demo: builds the kernel from kernelString, adds two 32-element float
- * arrays on the first GPU device of the context and prints every element of the result.
- *
- * @return always 0
- */
-int main(int argc, char* argv[])
-{
-	int ciErrNum = 0;
-	int preferred_device = -1;
-	int preferred_platform = -1;
-	cl_platform_id		platformId = 0;
-	cl_context			ctx;
-	cl_command_queue	queue;
-	cl_device_id		device;
-	cl_kernel			addKernel;
-	ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	if (!ctx) {
-		printf("No OpenCL capable GPU found!");
-		return 0;
-	}
-	// only query the platform once the context is known to exist
-	b3OpenCLUtils::printPlatformInfo(platformId);
-
-	device = b3OpenCLUtils::getDevice(ctx,0);
-	queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
-	addKernel = b3OpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum);
-	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	int numElements = 32;
-	b3OpenCLArray<float> a(ctx,queue);
-	b3OpenCLArray<float> b(ctx,queue);
-	b3OpenCLArray<float> c(ctx,queue);
-	// both inputs hold 0, 1, 2, ... so the expected output is c[i] = 2*i
-	for (int i=0;i<numElements;i++)
-	{
-		a.push_back(float(i));
-		b.push_back(float(i));
-	}
-
-	c.resize(numElements);
-	// the arguments are bound in the order of the kernel signature: a, b, c, numElements
-	b3LauncherCL launcher( queue, addKernel);
-	launcher.setBuffer( a.getBufferCL());
-	launcher.setBuffer( b.getBufferCL());
-	launcher.setBuffer( c.getBufferCL());
-	launcher.setConst(  numElements );
-	launcher.launch1D( numElements);
-	for (int i=0;i<numElements;i++)
-	{
-		float v = c.at(i);
-		printf("c[%d]=%f\n",i,v);
-	}
-	// release everything this function created, the kernel included
-	clReleaseKernel(addKernel);
-	clReleaseCommandQueue(queue);
-	clReleaseContext(ctx);
-	return 0;
-}
--- a/opencl/vector_add_simplified/premake4.lua
+++ b/opencl/vector_add_simplified/premake4.lua
@ -1,41 +0,0 @@
-
-function createProject (vendor)
-	-- Defines the console project "OpenCL_vector_add_simplified_<vendor>" when findOpenCL
-	-- reports OpenCL support for the given vendor; otherwise does nothing.
-
-	local hasCL = findOpenCL(vendor)
-	
-	if (hasCL) then
-
-		project ( "OpenCL_vector_add_simplified_" .. vendor)
-
-		initOpenCL(vendor)
-	
-		language "C++"
-				
-		kind "ConsoleApp"
-		targetdir "../../bin"
-
-		-- links against the per-vendor parallel primitives host library
-		links {
-			"OpenCL_lib_parallel_primitives_host_" .. vendor			
-		}
-
-		includedirs {
-			"../basic_initialize",
-			"../../src"
-		}
-		
-		-- the OpenCL utilities and the aligned allocator are compiled into the demo itself
-		files {
-			"main.cpp",
-			"../basic_initialize/b3OpenCLUtils.cpp",
-			"../basic_initialize/b3OpenCLUtils.h",
-			"../../src/Bullet3Common/b3AlignedAllocator.cpp",
-			"../../src/Bullet3Common/b3AlignedAllocator.h",
-			"../../src/Bullet3Common/b3AlignedObjectArray.h",
-		}
-	end
-	
-end
-
-createProject("AMD")
-createProject("NVIDIA")
-createProject("Intel")
-createProject("Apple")
--- a/opencl/parallel_primitives/host/b3Int4.h
+++ b/opencl/parallel_primitives/host/b3Int4.h
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
@ -1,15 +1,15 @@

 #include "b3GpuSapBroadphase.h"
 #include "Bullet3Common/b3Vector3.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
 #include "Bullet3Common/b3Quickprof.h"
-#include "basic_initialize/b3OpenCLUtils.h"
-
-
-#include "../kernels/sapKernels.h"
-#include "../kernels/sapFastKernels.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "kernels/sapKernels.h"
+#include "kernels/sapFastKernels.h"
 #include "Bullet3Common/b3MinMax.h"

+#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
+#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"

 b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q )
 :m_context(ctx),
@ -28,9 +28,9 @@ m_currentBuffer(-1)
    
 	cl_int errNum=0;

-	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl");
+	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
 	b3Assert(errNum==CL_SUCCESS);
-	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl");
+	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
 	b3Assert(errNum==CL_SUCCESS);

 	
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
@ -1,10 +1,10 @@
 #ifndef B3_GPU_SAP_BROADPHASE_H
 #define B3_GPU_SAP_BROADPHASE_H

-#include "parallel_primitives/host/b3OpenCLArray.h"
-#include "parallel_primitives/host/b3FillCL.h" //b3Int2
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
 class b3Vector3;
-#include "parallel_primitives/host/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"

 #include "b3SapAabb.h"

--- a/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
--- a/src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h
+++ b/src/Bullet3OpenCL/Initialize/b3OpenCLInclude.h
--- a/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp
+++ b/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp
--- a/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h
+++ b/src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3Collidable.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3Collidable.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
@ -29,18 +29,23 @@ typedef b3AlignedObjectArray<b3Vector3> b3VertexArray;
 #include "Bullet3Common/b3Quickprof.h"

 #include <float.h> //for FLT_MAX
-#include "basic_initialize/b3OpenCLUtils.h"
-#include "parallel_primitives/host/b3LauncherCL.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
 //#include "AdlQuaternion.h"

-#include "../kernels/satKernels.h"
-#include "../kernels/satClipHullContacts.h"
-#include "../kernels/bvhTraversal.h"
-#include "../kernels/primitiveContacts.h"
+#include "kernels/satKernels.h"
+#include "kernels/satClipHullContacts.h"
+#include "kernels/bvhTraversal.h"
+#include "kernels/primitiveContacts.h"


 #include "Bullet3Geometry/b3AabbUtil.h"

+#define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl"
+#define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl"
+#define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl"
+#define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl"
+

 #define dot3F4 b3Dot

@ -64,7 +69,7 @@ m_totalContactsOut(m_context, m_queue)
 //		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl");
 //#endif

-		cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,flags,"opencl/gpu_narrowphase/kernels/sat.cl");
+		cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,flags,BT_NARROWPHASE_SAT_PATH);
 		b3Assert(errNum==CL_SUCCESS);

 		m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,src, "findSeparatingAxisKernel",&errNum,satProg );
@ -92,7 +97,7 @@ m_totalContactsOut(m_context, m_queue)
 //		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
 //#endif

-		cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,"opencl/gpu_narrowphase/kernels/satClipHullContacts.cl");
+		cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH);
 		b3Assert(errNum==CL_SUCCESS);

 		m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg);
@ -132,7 +137,7 @@ m_totalContactsOut(m_context, m_queue)
 	 if (1)
 	{
 		const char* srcBvh = bvhTraversalKernelCL;
-		cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"","opencl/gpu_narrowphase/kernels/bvhTraversal.cl");
+		cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH);
 		b3Assert(errNum==CL_SUCCESS);

 		m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,"");
@ -142,7 +147,7 @@ m_totalContactsOut(m_context, m_queue)
        
 	 {
 		 const char* primitiveContactsSrc = primitiveContactsKernelsCL;
-		cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"","opencl/gpu_narrowphase/kernels/primitiveContacts.cl");
+		cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH);
 		b3Assert(errNum==CL_SUCCESS);

 		m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,"");
@ -527,7 +532,7 @@ void computeContactPlaneConvex(int pairIndex,
 				b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
 				c->m_worldPos[i] = pOnB1;
 			}
-			c->m_worldNormal[3] = numReducedPoints;
+			c->m_worldNormal[3] = (b3Scalar)numReducedPoints;
 		}//if (dstIdx < numPairs)
 	}	
 		
@ -665,7 +670,7 @@ void computeContactPlaneCompound(int pairIndex,
 				b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]];
 				c->m_worldPos[i] = pOnB1;
 			}
-			c->m_worldNormal[3] = numReducedPoints;
+			c->m_worldNormal[3] = (b3Scalar)numReducedPoints;
 		}//if (dstIdx < numPairs)
 	}	
 		
@ -825,7 +830,7 @@ void	computeContactSphereConvex(int pairIndex,
 			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
 			c->m_worldPos[0] = pOnB1;
 			int numPoints = 1;
-			c->m_worldNormal[3] = numPoints;
+			c->m_worldNormal[3] = (b3Scalar)numPoints;
 		}//if (dstIdx < numPairs)
 		}
 	}//if (hasCollision)
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
@ -2,15 +2,15 @@
 #ifndef _CONVEX_HULL_CONTACT_H
 #define _CONVEX_HULL_CONTACT_H

-#include "parallel_primitives/host/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
 #include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h"
 #include "Bullet3Common/b3AlignedObjectArray.h"
 #include "b3ConvexUtility.h"
 #include "b3ConvexPolyhedronCL.h"
 #include "b3Collidable.h"
 #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
-#include "parallel_primitives/host/b3Int2.h"
-#include "parallel_primitives/host/b3Int4.h"
+#include "Bullet3Common/b3Int2.h"
+#include "Bullet3Common/b3Int4.h"
 #include "b3OptimizedBvh.h"
 #include "b3BvhInfo.h"

--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.cpp
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexUtility.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl
--- a/Show More
+++ b/Show More